diff --git a/llvm/include/llvm/CodeGen/ExpandFp.h b/llvm/include/llvm/CodeGen/ExpandFp.h index c13119a4238ef..113ea4ae47ef7 100644 --- a/llvm/include/llvm/CodeGen/ExpandFp.h +++ b/llvm/include/llvm/CodeGen/ExpandFp.h @@ -10,6 +10,7 @@ #define LLVM_CODEGEN_EXPANDFP_H #include "llvm/IR/PassManager.h" +#include "llvm/Support/CodeGen.h" namespace llvm { @@ -18,11 +19,15 @@ class TargetMachine; class ExpandFpPass : public PassInfoMixin { private: const TargetMachine *TM; + CodeGenOptLevel OptLevel; public: - explicit ExpandFpPass(const TargetMachine *TM_) : TM(TM_) {} + explicit ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + void printPipeline(raw_ostream &OS, + function_ref MapClassName2PassName); }; } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 095a40ecff289..593308150dc82 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -550,6 +550,9 @@ LLVM_ABI FunctionPass *createCFIFixup(); /// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp LLVM_ABI FunctionPass *createCFIInstrInserter(); +// Expands floating point instructions. +FunctionPass *createExpandFpPass(CodeGenOptLevel); + /// Creates CFGuard longjmp target identification pass. 
/// \see CFGuardLongjmp.cpp LLVM_ABI FunctionPass *createCFGuardLongjmpPass(); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 97c3ff869edf4..6a241f55245c7 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -737,7 +737,7 @@ void CodeGenPassBuilder::addISelPasses( addPass(PreISelIntrinsicLoweringPass(&TM)); addPass(ExpandLargeDivRemPass(&TM)); - addPass(ExpandFpPass(&TM)); + addPass(ExpandFpPass(&TM, getOptLevel())); derived().addIRPasses(addPass); derived().addCodeGenPrepare(addPass); diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 1c1047c1ce188..adebd704791ee 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -16,18 +16,29 @@ #include "llvm/CodeGen/ExpandFp.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/SimplifyQuery.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include + +#define DEBUG_TYPE "expand-fp" using namespace llvm; @@ -37,6 +48,359 @@ static cl::opt cl::desc("fp convert instructions on integers with " "more than bits are expanded.")); +namespace { +/// This class implements a precise expansion of the frem instruction. 
+/// The generated code is based on the fmod implementation in the AMD device +/// libs. +class FRemExpander { + /// The IRBuilder to use for the expansion. + IRBuilder<> &B; + + /// Floating point type of the return value and the arguments of the FRem + /// instructions that should be expanded. + Type *FremTy; + + /// Floating point type to use for the computation. This may be + /// wider than the \p FremTy. + Type *ComputeFpTy; + + /// Integer type used to hold the exponents returned by frexp. + Type *ExTy; + + /// How many bits of the quotient to compute per iteration of the + /// algorithm, stored as a value of type \p ExTy. + Value *Bits; + + /// Constant 1 of type \p ExTy. + Value *One; + +public: + static bool canExpandType(Type *Ty) { + // TODO The expansion should work for other floating point types + // as well, but this would require additional testing. + return Ty->isIEEELikeFPTy() && !Ty->isBFloatTy() && !Ty->isFP128Ty(); + } + + static FRemExpander create(IRBuilder<> &B, Type *Ty) { + assert(canExpandType(Ty)); + + // The type to use for the computation of the remainder. This may be + // wider than the input/result type which affects the ... + Type *ComputeTy = Ty; + // ... maximum number of iterations of the remainder computation loop + // to use. This value is for the case in which the computation + // uses the same input/result type. + unsigned MaxIter = 2; + + if (Ty->isHalfTy()) { + // Use the wider type and less iterations. + ComputeTy = B.getFloatTy(); + MaxIter = 1; + } + + unsigned Precision = + llvm::APFloat::semanticsPrecision(Ty->getFltSemantics()); + return FRemExpander{B, Ty, Precision / MaxIter, ComputeTy}; + } + + /// Build the FRem expansion for the numerator \p X and the + /// denumerator \p Y. The type of X and Y must match \p FremTy. The + /// code will be generated at the insertion point of \p B and the + /// insertion point will be reset at exit. 
+ Value *buildFRem(Value *X, Value *Y, std::optional &SQ) const; + + /// Build an approximate FRem expansion for the numerator \p X and + /// the denumerator \p Y at the insertion point of builder \p B. + /// The type of X and Y must match \p FremTy. + Value *buildApproxFRem(Value *X, Value *Y) const; + +private: + FRemExpander(IRBuilder<> &B, Type *FremTy, unsigned Bits, Type *ComputeFpTy) + : B(B), FremTy(FremTy), ComputeFpTy(ComputeFpTy), ExTy(B.getInt32Ty()), + Bits(ConstantInt::get(ExTy, Bits)), One(ConstantInt::get(ExTy, 1)) {}; + + Value *createRcp(Value *V, const Twine &Name) const { + // Leave it to later optimizations to turn this into an rcp + // instruction if available. + return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name); + } + + // Helper function to build the UPDATE_AX code which is common to the + // loop body and the "final iteration". + Value *buildUpdateAx(Value *Ax, Value *Ay, Value *Ayinv) const { + // Build: + // float q = rint(ax * ayinv); + // ax = fma(-q, ay, ax); + // int clt = ax < 0.0f; + // float axp = ax + ay; + // ax = clt ? axp : ax; + Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv), + {}, "q"); + Value *AxUpdate = B.CreateFMA(B.CreateFNeg(Q), Ay, Ax, {}, "ax"); + Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate, + ConstantFP::getZero(ComputeFpTy), "clt"); + Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp"); + return B.CreateSelect(Clt, Axp, AxUpdate, "ax"); + } + + /// Build code to extract the exponent and mantissa of \p Src. + /// Return the exponent minus one for use as a loop bound and + /// the mantissa taken to the given \p NewExp power. 
+ std::pair buildExpAndPower(Value *Src, Value *NewExp, + const Twine &ExName, + const Twine &PowName) const { + // Build: + // ExName = frexp_exp(Src) - 1; + // PowName = fldexp(frexp_mant(Src), NewExp); + Type *Ty = Src->getType(); + Type *ExTy = B.getInt32Ty(); + Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src); + Value *Mant = B.CreateExtractValue(Frexp, {0}); + Value *Exp = B.CreateExtractValue(Frexp, {1}); + + Exp = B.CreateSub(Exp, One, ExName); + Value *Pow = B.CreateLdexp(Mant, NewExp, {}, PowName); + + return {Pow, Exp}; + } + + /// Build the main computation of the remainder for the case in which + /// Ax > Ay, where Ax = |X|, Ay = |Y|, and X is the numerator and Y the + /// denominator. Add the incoming edge from the computation result + /// to \p RetPhi. + void buildRemainderComputation(Value *AxInitial, Value *AyInitial, Value *X, + PHINode *RetPhi, FastMathFlags FMF) const { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(FMF); + + // Build: + // ex = frexp_exp(ax) - 1; + // ax = fldexp(frexp_mant(ax), bits); + // ey = frexp_exp(ay) - 1; + // ay = fldexp(frexp_mant(ay), 1); + auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax"); + auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay"); + + // Build: + // int nb = ex - ey; + // float ayinv = 1.0/ay; + Value *Nb = B.CreateSub(Ex, Ey, "nb"); + Value *Ayinv = createRcp(Ay, "ayinv"); + + // Build: while (nb > bits) + BasicBlock *PreheaderBB = B.GetInsertBlock(); + Function *Fun = PreheaderBB->getParent(); + auto *LoopBB = BasicBlock::Create(B.getContext(), "frem.loop_body", Fun); + auto *ExitBB = BasicBlock::Create(B.getContext(), "frem.loop_exit", Fun); + + B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, Nb, Bits), LoopBB, ExitBB); + + // Build loop body: + // UPDATE_AX + // ax = fldexp(ax, bits); + // nb -= bits; + // One iteration of the loop is factored out. The code shared by + // the loop and this "iteration" is denoted by UPDATE_AX. 
+ B.SetInsertPoint(LoopBB); + PHINode *NbIv = B.CreatePHI(Nb->getType(), 2, "nb_iv"); + NbIv->addIncoming(Nb, PreheaderBB); + + auto *AxPhi = B.CreatePHI(ComputeFpTy, 2, "ax_loop_phi"); + AxPhi->addIncoming(Ax, PreheaderBB); + + Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv); + AxPhiUpdate = B.CreateLdexp(AxPhiUpdate, Bits, {}, "ax_update"); + AxPhi->addIncoming(AxPhiUpdate, LoopBB); + NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB); + + B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, NbIv, Bits), LoopBB, ExitBB); + + // Build final iteration + // ax = fldexp(ax, nb - bits + 1); + // UPDATE_AX + B.SetInsertPoint(ExitBB); + + auto *AxPhiExit = B.CreatePHI(ComputeFpTy, 2, "ax_exit_phi"); + AxPhiExit->addIncoming(Ax, PreheaderBB); + AxPhiExit->addIncoming(AxPhi, LoopBB); + auto *NbExitPhi = B.CreatePHI(Nb->getType(), 2, "nb_exit_phi"); + NbExitPhi->addIncoming(NbIv, LoopBB); + NbExitPhi->addIncoming(Nb, PreheaderBB); + + Value *AxFinal = B.CreateLdexp( + AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), {}, "ax"); + AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv); + + // Build: + // ax = fldexp(ax, ey); + // ret = copysign(ax,x); + AxFinal = B.CreateLdexp(AxFinal, Ey, {}, "ax"); + if (ComputeFpTy != FremTy) + AxFinal = B.CreateFPTrunc(AxFinal, FremTy); + Value *Ret = B.CreateCopySign(AxFinal, X); + + RetPhi->addIncoming(Ret, ExitBB); + } + + /// Build the else-branch of the conditional in the FRem + /// expansion, i.e. the case in wich Ax <= Ay, where Ax = |X|, Ay + /// = |Y|, and X is the numerator and Y the denumerator. Add the + /// incoming edge from the result to \p RetPhi. + void buildElseBranch(Value *Ax, Value *Ay, Value *X, PHINode *RetPhi) const { + // Build: + // ret = ax == ay ? 
copysign(0.0f, x) : x; + Value *ZeroWithXSign = B.CreateCopySign(ConstantFP::getZero(FremTy), X); + Value *Ret = B.CreateSelect(B.CreateFCmpOEQ(Ax, Ay), ZeroWithXSign, X); + + RetPhi->addIncoming(Ret, B.GetInsertBlock()); + } + + /// Return a value that is NaN if one of the corner cases concerning + /// the inputs \p X and \p Y is detected, and \p Ret otherwise. + Value *handleInputCornerCases(Value *Ret, Value *X, Value *Y, + std::optional &SQ, + bool NoInfs) const { + // Build: + // ret = (y == 0.0f || isnan(y)) ? QNAN : ret; + // ret = isfinite(x) ? ret : QNAN; + Value *Nan = ConstantFP::getQNaN(FremTy); + Ret = B.CreateSelect(B.CreateFCmpUEQ(Y, ConstantFP::getZero(FremTy)), Nan, + Ret); + Value *XFinite = + NoInfs || (SQ && isKnownNeverInfinity(X, *SQ)) + ? B.getTrue() + : B.CreateFCmpULT(B.CreateUnaryIntrinsic(Intrinsic::fabs, X), + ConstantFP::getInfinity(FremTy)); + Ret = B.CreateSelect(XFinite, Ret, Nan); + + return Ret; + } +}; + +Value *FRemExpander::buildApproxFRem(Value *X, Value *Y) const { + IRBuilder<>::FastMathFlagGuard Guard(B); + // Propagating the approximate functions flag to the + // division leads to an unacceptable drop in precision + // on AMDGPU. + // TODO Find out if any flags might be worth propagating. 
+ B.clearFastMathFlags(); + + Value *Quot = B.CreateFDiv(X, Y); + Value *Trunc = B.CreateUnaryIntrinsic(Intrinsic::trunc, Quot, {}); + Value *Neg = B.CreateFNeg(Trunc); + + return B.CreateFMA(Neg, Y, X); +} + +Value *FRemExpander::buildFRem(Value *X, Value *Y, + std::optional &SQ) const { + assert(X->getType() == FremTy && Y->getType() == FremTy); + + FastMathFlags FMF = B.getFastMathFlags(); + + // This function generates the following code structure: + // if (abs(x) > abs(y)) + // { ret = compute remainder } + // else + // { ret = x or 0 with sign of x } + // Adjust ret to NaN/inf in input + // return ret + Value *Ax = B.CreateUnaryIntrinsic(Intrinsic::fabs, X, {}, "ax"); + Value *Ay = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y, {}, "ay"); + if (ComputeFpTy != X->getType()) { + Ax = B.CreateFPExt(Ax, ComputeFpTy, "ax"); + Ay = B.CreateFPExt(Ay, ComputeFpTy, "ay"); + } + Value *AxAyCmp = B.CreateFCmpOGT(Ax, Ay); + + PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret"); + Value *Ret = RetPhi; + + // We would return NaN in all corner cases handled here. + // Hence, if NaNs are excluded, keep the result as it is. + if (!FMF.noNaNs()) + Ret = handleInputCornerCases(Ret, X, Y, SQ, FMF.noInfs()); + + Function *Fun = B.GetInsertBlock()->getParent(); + auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun); + auto *ElseBB = BasicBlock::Create(B.getContext(), "frem.else", Fun); + SplitBlockAndInsertIfThenElse(AxAyCmp, RetPhi, &ThenBB, &ElseBB); + + auto SavedInsertPt = B.GetInsertPoint(); + + // Build remainder computation for "then" branch + // + // The ordered comparison ensures that ax and ay are not NaNs + // in the then-branch. Furthermore, y cannot be an infinity and the + // check at the end of the function ensures that the result will not + // be used if x is an infinity. 
+ FastMathFlags ComputeFMF = FMF; + ComputeFMF.setNoInfs(); + ComputeFMF.setNoNaNs(); + // Only the compute branch may assume nnan/ninf, so pass ComputeFMF. + B.SetInsertPoint(ThenBB); + buildRemainderComputation(Ax, Ay, X, RetPhi, ComputeFMF); + B.CreateBr(RetPhi->getParent()); + + // Build "else"-branch + B.SetInsertPoint(ElseBB); + buildElseBranch(Ax, Ay, X, RetPhi); + B.CreateBr(RetPhi->getParent()); + + B.SetInsertPoint(SavedInsertPt); + + return Ret; +} +} // namespace + +static bool expandFRem(BinaryOperator &I, std::optional &SQ) { + LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n'); + + Type *ReturnTy = I.getType(); + assert(FRemExpander::canExpandType(ReturnTy->getScalarType())); + + FastMathFlags FMF = I.getFastMathFlags(); + // TODO Make use of those flags for optimization? + FMF.setAllowReciprocal(false); + FMF.setAllowContract(false); + + IRBuilder<> B(&I); + B.setFastMathFlags(FMF); + B.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *ElemTy = ReturnTy->getScalarType(); + const FRemExpander Expander = FRemExpander::create(B, ElemTy); + + Value *Ret; + if (ReturnTy->isFloatingPointTy()) + Ret = FMF.approxFunc() + ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1)) + : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ); + else { + auto *VecTy = cast(ReturnTy); + + // This could use SplitBlockAndInsertForEachLane but the interface + // is a bit awkward for a constant number of elements and it will + // boil down to the same code. + // TODO Expand the FRem instruction only once and reuse the code. + Value *Nums = I.getOperand(0); + Value *Denums = I.getOperand(1); + Ret = PoisonValue::get(I.getType()); + for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) { + Value *Num = B.CreateExtractElement(Nums, I); + Value *Denum = B.CreateExtractElement(Denums, I); + Value *Rem = FMF.approxFunc() ? 
Expander.buildApproxFRem(Num, Denum) + : Expander.buildFRem(Num, Denum, SQ); + Ret = B.CreateInsertElement(Ret, Rem, I); + } + } + + I.replaceAllUsesWith(Ret); + Ret->takeName(&I); + I.eraseFromParent(); + + return true; +} // clang-format off: preserve formatting of the following example /// Generate code to convert a fp number to integer, replacing FPToS(U)I with @@ -64,8 +428,8 @@ static cl::opt /// br i1 %cmp6.not, label %if.end12, label %if.then8 /// /// if.then8: ; preds = %if.end -/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808 -/// br label %cleanup +/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 +/// -9223372036854775808 br label %cleanup /// /// if.end12: ; preds = %if.end /// %cmp13 = icmp ult i64 %shr, 150 @@ -83,9 +447,10 @@ static cl::opt /// %mul19 = mul nsw i64 %shl, %conv /// br label %cleanup /// -/// cleanup: ; preds = %entry, %if.else, %if.then15, %if.then8 -/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ %mul19, %if.else ], [ 0, %entry ] -/// ret i64 %retval.0 +/// cleanup: ; preds = %entry, +/// %if.else, %if.then15, %if.then8 +/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ +/// %mul19, %if.else ], [ 0, %entry ] ret i64 %retval.0 /// } /// /// Replace fp to integer with generated code. 
@@ -272,13 +637,11 @@ static void expandFPToI(Instruction *FPToI) { /// %or = or i64 %shr6, %conv11 /// br label %sw.epilog /// -/// sw.epilog: ; preds = %sw.default, %if.then4, %sw.bb -/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, %sw.bb ] -/// %1 = lshr i64 %a.addr.0, 2 -/// %2 = and i64 %1, 1 -/// %or16 = or i64 %2, %a.addr.0 -/// %inc = add nsw i64 %or16, 1 -/// %3 = and i64 %inc, 67108864 +/// sw.epilog: ; preds = %sw.default, +/// %if.then4, %sw.bb +/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, +/// %sw.bb ] %1 = lshr i64 %a.addr.0, 2 %2 = and i64 %1, 1 %or16 = or i64 %2, +/// %a.addr.0 %inc = add nsw i64 %or16, 1 %3 = and i64 %inc, 67108864 /// %tobool.not = icmp eq i64 %3, 0 /// %spec.select.v = select i1 %tobool.not, i64 2, i64 3 /// %spec.select = ashr i64 %inc, %spec.select.v @@ -291,7 +654,8 @@ static void expandFPToI(Instruction *FPToI) { /// %shl25 = shl i64 %sub, %sh_prom24 /// br label %if.end26 /// -/// if.end26: ; preds = %sw.epilog, %if.else +/// if.end26: ; preds = %sw.epilog, +/// %if.else /// %a.addr.1 = phi i64 [ %shl25, %if.else ], [ %spec.select, %sw.epilog ] /// %e.0 = phi i32 [ %sub2, %if.else ], [ %spec.select56, %sw.epilog ] /// %conv27 = trunc i64 %shr to i32 @@ -305,7 +669,8 @@ static void expandFPToI(Instruction *FPToI) { /// %4 = bitcast i32 %or33 to float /// br label %return /// -/// return: ; preds = %entry, %if.end26 +/// return: ; preds = %entry, +/// %if.end26 /// %retval.0 = phi float [ %4, %if.end26 ], [ 0.000000e+00, %entry ] /// ret float %retval.0 /// } @@ -594,7 +959,38 @@ static void scalarize(Instruction *I, SmallVectorImpl &Replace) { I->eraseFromParent(); } -static bool runImpl(Function &F, const TargetLowering &TLI) { +// This covers all floating point types; more than we need here. +// TODO Move somewhere else for general use? +/// Return the Libcall for a frem instruction of +/// type \p Ty. 
+static RTLIB::Libcall fremToLibcall(Type *Ty) { + assert(Ty->isFloatingPointTy()); + if (Ty->isFloatTy() || Ty->is16bitFPTy()) + return RTLIB::REM_F32; + if (Ty->isDoubleTy()) + return RTLIB::REM_F64; + if (Ty->isFP128Ty()) + return RTLIB::REM_F128; + if (Ty->isX86_FP80Ty()) + return RTLIB::REM_F80; + if (Ty->isPPC_FP128Ty()) + return RTLIB::REM_PPCF128; + + llvm_unreachable("Unknown floating point type"); +} + +/* Return true if, according to \p TLI, the target either directly + supports the frem instruction for the type \p Ty, has a custom lowering, + or uses a libcall. */ +static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) { + if (!TLI.isOperationExpand(ISD::FREM, EVT::getEVT(Ty))) + return true; + + return TLI.getLibcallName(fremToLibcall(Ty->getScalarType())); +} + +static bool runImpl(Function &F, const TargetLowering &TLI, + AssumptionCache *AC) { SmallVector Replace; SmallVector ReplaceVector; bool Modified = false; @@ -609,6 +1005,21 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { for (auto &I : instructions(F)) { switch (I.getOpcode()) { + case Instruction::FRem: { + Type *Ty = I.getType(); + // TODO: This pass doesn't handle scalable vectors. + if (Ty->isScalableTy()) + continue; + + if (targetSupportsFrem(TLI, Ty) || + !FRemExpander::canExpandType(Ty->getScalarType())) + continue; + + Replace.push_back(&I); + Modified = true; + + break; + } case Instruction::FPToUI: case Instruction::FPToSI: { // TODO: This pass doesn't handle scalable vectors. 
@@ -659,8 +1070,20 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { while (!Replace.empty()) { Instruction *I = Replace.pop_back_val(); - if (I->getOpcode() == Instruction::FPToUI || - I->getOpcode() == Instruction::FPToSI) { + if (I->getOpcode() == Instruction::FRem) { + auto SQ = [&]() -> std::optional { + if (AC) { + auto Res = std::make_optional( + I->getModule()->getDataLayout(), I); + Res->AC = AC; + return Res; + } + return {}; + }(); + + expandFRem(cast(*I), SQ); + } else if (I->getOpcode() == Instruction::FPToUI || + I->getOpcode() == Instruction::FPToSI) { expandFPToI(I); } else { expandIToFP(I); @@ -672,31 +1095,61 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { namespace { class ExpandFpLegacyPass : public FunctionPass { + CodeGenOptLevel OptLevel; + public: static char ID; - ExpandFpLegacyPass() : FunctionPass(ID) { + ExpandFpLegacyPass(CodeGenOptLevel OptLevel) + : FunctionPass(ID), OptLevel(OptLevel) { initializeExpandFpLegacyPassPass(*PassRegistry::getPassRegistry()); } + ExpandFpLegacyPass() : ExpandFpLegacyPass(CodeGenOptLevel::None) {}; + bool runOnFunction(Function &F) override { + // Do not call skipFunction() here: targets that mark FREM as Expand + // rely on this pass for lowering, so it must also run for optnone. + auto *TM = &getAnalysis().getTM(); auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering(); - return runImpl(F, *TLI); + AssumptionCache *AC = nullptr; + // The cache is only required (see getAnalysisUsage) when optimizing. + if (OptLevel != CodeGenOptLevel::None && !F.hasOptNone()) + AC = &getAnalysis().getAssumptionCache(F); + return runImpl(F, *TLI, AC); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + if (OptLevel != CodeGenOptLevel::None) + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); } }; } // namespace +ExpandFpPass::ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel) + : TM(TM), OptLevel(OptLevel) {} + +void ExpandFpPass::printPipeline( + raw_ostream &OS, function_ref MapClassName2PassName) { + static_cast *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << '<'; + OS << "opt-level=" << 
(int)OptLevel; + OS << '>'; +} + PreservedAnalyses ExpandFpPass::run(Function &F, FunctionAnalysisManager &FAM) { const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F); - return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); + auto &TLI = *STI->getTargetLowering(); + AssumptionCache *AC = nullptr; + if (OptLevel != CodeGenOptLevel::None) + AC = &FAM.getResult(F); + return runImpl(F, TLI, AC) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } char ExpandFpLegacyPass::ID = 0; @@ -704,4 +1157,6 @@ INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp", "Expand certain fp instructions", false, false) INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false) -FunctionPass *llvm::createExpandFpPass() { return new ExpandFpLegacyPass(); } +FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) { + return new ExpandFpLegacyPass(OptLevel); +} diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 7d7c6e743fa76..f83973c30b48a 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1074,7 +1074,7 @@ bool TargetPassConfig::addISelPasses() { PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); addPass(createPreISelIntrinsicLoweringPass()); addPass(createExpandLargeDivRemPass()); - addPass(createExpandFpPass()); + addPass(createExpandFpPass(getOptLevel())); addIRPasses(); addCodeGenPrepare(); addPassesToHandleExceptions(); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index d75304b5e11f6..587f0ece0859b 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -185,6 +185,7 @@ #include "llvm/IR/Verifier.h" #include "llvm/IRPrinter/IRPrintingPasses.h" #include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/ErrorHandling.h" @@ -1492,6 +1493,33 @@ parseBoundsCheckingOptions(StringRef Params) { return Options; } +Expected parseExpandFpOptions(StringRef Params) { + if (Params.empty()) + return CodeGenOptLevel::None; + + StringRef Param; + std::tie(Param, Params) = Params.split(';'); + if (!Params.empty()) + return createStringError("too many expand-fp pass parameters"); + + auto [Name, Val] = Param.split('='); + if (Name != "opt-level") + return createStringError("invalid expand-fp pass parameter '%s'", + Param.str().c_str()); + int8_t N; + // getAsInteger() returns true on failure and then leaves N unset, so + // reject missing or non-numeric values before N is read. + if (Val.getAsInteger(10, N)) + return createStringError("invalid expand-fp opt-level value: %s", + Val.str().c_str()); + std::optional Level = CodeGenOpt::getLevel(N); + if (!Level.has_value()) + return createStringError("invalid expand-fp opt-level value: %s", + Val.str().c_str()); + + return *Level; +} + Expected parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) { if (Params.empty() || Params == "all") diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 4b462b9c6845c..bbb5569b28722 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -426,7 +426,6 @@ FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM)) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM)) -FUNCTION_PASS("expand-fp", ExpandFpPass(TM)) FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM)) FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) FUNCTION_PASS("extra-vector-passes", @@ -720,6 +719,13 @@ FUNCTION_PASS_WITH_PARAMS( }, parseBoundsCheckingOptions, "trap;rt;rt-abort;min-rt;min-rt-abort;merge;guard=N") + FUNCTION_PASS_WITH_PARAMS( + "expand-fp", "ExpandFpPass", + [TM = TM](CodeGenOptLevel OL) { + return ExpandFpPass(TM, OL); + }, + parseExpandFpOptions, "opt-level") + #undef FUNCTION_PASS_WITH_PARAMS #ifndef LOOPNEST_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 
20426d54141c4..c0ab055eba681 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -423,7 +423,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64}, Expand); - setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); + setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand); if (Subtarget->has16BitInsts()) { setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); @@ -1439,8 +1439,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); - case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); - case ISD::FREM: return LowerFREM(Op, DAG); + case ISD::SDIVREM: + return LowerSDIVREM(Op, DAG); case ISD::FCEIL: return LowerFCEIL(Op, DAG); case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); @@ -2435,21 +2435,6 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, return DAG.getMergeValues(Res, DL); } -// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x) -SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - EVT VT = Op.getValueType(); - auto Flags = Op->getFlags(); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags); - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags); - SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags); - // TODO: For f32 use FMAD instead if !hasFastFMA32? 
- return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags); -} - SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 2226fd20fb774..026115c4edc0e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -7,12 +7,215 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: s_load_dword s3, s[4:5], 0x2 +; CI-NEXT: s_mov_b32 s4, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s3| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v0 +; CI-NEXT: s_cbranch_vccz .LBB0_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v0 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: .LBB0_2: ; %Flow18 +; CI-NEXT: s_xor_b32 s4, s4, 1 +; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_cbranch_scc1 .LBB0_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e32 v1, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v1, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB0_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB0_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; CI-NEXT: s_cbranch_vccnz .LBB0_5 +; CI-NEXT: s_branch .LBB0_7 +; CI-NEXT: .LBB0_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB0_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_or_b32_e32 v1, s4, v0 +; CI-NEXT: .LBB0_8: ; %Flow19 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, 0 +; CI-NEXT: s_and_b32 s2, s2, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 +; CI-NEXT: s_cselect_b32 s2, 1, 0 +; 
CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: s_and_b32 s2, 1, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s0, s[10:11], 0x0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x8 +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, |s0| +; VI-NEXT: v_cvt_f32_f16_e64 v0, |s1| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v0 +; VI-NEXT: s_cbranch_vccz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB0_2: ; %Flow18 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB0_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e32 v2, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1 +; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v5 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_ldexp_f32 v4, v2, 11 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v3, v0 +; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, 
v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB0_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB0_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; VI-NEXT: s_cbranch_vccnz .LBB0_5 +; VI-NEXT: s_branch .LBB0_7 +; VI-NEXT: .LBB0_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB0_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v2, s2, v0 +; VI-NEXT: .LBB0_8: ; %Flow19 +; VI-NEXT: v_mov_b32_e32 v0, 0x7c00 +; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s1, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s0|, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm + %gep2 = 
getelementptr half, ptr addrspace(1) %in2, i32 4 + %r0 = load half, ptr addrspace(1) %in1, align 4 + %r1 = load half, ptr addrspace(1) %gep2, align 4 + %r2 = frem half %r0, %r1 + store half %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: fast_frem_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s6, s[2:3], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 @@ -27,15 +230,21 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f16: +; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -65,33 +274,51 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load 
half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 - %r2 = frem half %r0, %r1 + %r2 = frem fast half %r0, %r1 store half %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: fast_frem_f16: +define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { +; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x2 +; CI-NEXT: s_load_dword s6, s[2:3], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v5, v4, v4 +; CI-NEXT: v_mul_f32_e32 v5, v3, v4 +; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v5, v6, v4, v5 +; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: fast_frem_f16: +; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -99,11 +326,21 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: s_load_dword s3, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f16_e32 v0, s3 -; VI-NEXT: v_mul_f16_e32 v0, s2, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: v_mul_f32_e32 v4, v0, v3 +; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 +; VI-NEXT: v_mac_f32_e32 v4, v5, v3 +; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 +; VI-NEXT: v_mul_f32_e32 v0, v0, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; VI-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 ; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v2, -v0, s3, v1 +; VI-NEXT: v_fma_f16 v2, -v0, v1, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -111,59 +348,209 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 - %r2 = frem fast half %r0, %r1 + %r2 = frem afn half %r0, %r1 store half %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { -; CI-LABEL: unsafe_frem_f16: +define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) 
%in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x2 +; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_mov_b32 s4, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; CI-NEXT: ; implicit-def: $vgpr0 +; CI-NEXT: s_cbranch_vccz .LBB3_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s4, s2, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: .LBB3_2: ; %Flow16 +; CI-NEXT: s_xor_b32 s4, s4, 1 +; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_cbranch_scc1 .LBB3_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, 
v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB3_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB3_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; CI-NEXT: s_cbranch_vccnz .LBB3_5 +; CI-NEXT: s_branch .LBB3_7 +; CI-NEXT: .LBB3_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB3_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CI-NEXT: s_and_b32 s4, s2, 0x80000000 +; CI-NEXT: v_or_b32_e32 v0, s4, v0 +; CI-NEXT: .LBB3_8: ; %Flow17 +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s3, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: 
unsafe_frem_f16: +; VI-LABEL: frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x8 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s4, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f16_e32 v0, s3 -; VI-NEXT: v_mul_f16_e32 v0, s2, v0 -; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_cbranch_vccz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s4, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: .LBB3_2: ; %Flow16 +; VI-NEXT: s_xor_b32 s4, s4, 1 +; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cbranch_scc1 .LBB3_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 12 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB3_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB3_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; VI-NEXT: s_cbranch_vccnz .LBB3_5 +; VI-NEXT: s_branch .LBB3_7 +; VI-NEXT: .LBB3_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB3_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; VI-NEXT: s_and_b32 s4, s2, 0x80000000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: .LBB3_8: ; %Flow17 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm - %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 - %r0 = load half, ptr addrspace(1) %in1, align 4 - %r1 = load half, ptr addrspace(1) %gep2, 
align 4 - %r2 = frem afn half %r0, %r1 - store half %r2, ptr addrspace(1) %out, align 4 + %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 + %r0 = load float, ptr addrspace(1) %in1, align 4 + %r1 = load float, ptr addrspace(1) %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: frem_f32: +define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -192,7 +579,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f32: +; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -223,43 +610,65 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem float %r0, %r1 + %r2 = frem fast float %r0, %r1 store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: fast_frem_f32: +define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { +; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_load_dword s6, s[2:3], 0x0 +; 
CI-NEXT: s_load_dword s2, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s2 -; CI-NEXT: v_rcp_f32_e32 v0, s3 -; CI-NEXT: v_mul_f32_e32 v0, s2, v0 -; CI-NEXT: v_trunc_f32_e32 v0, v0 -; CI-NEXT: v_fma_f32 v0, -v0, s3, v1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6 +; CI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; CI-NEXT: v_rcp_f32_e32 v3, v1 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v4, v3, v3 +; CI-NEXT: v_mul_f32_e32 v4, v2, v3 +; CI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; CI-NEXT: v_fma_f32 v4, v5, v3, v4 +; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s6 +; CI-NEXT: v_trunc_f32_e32 v1, v1 +; CI-NEXT: v_fma_f32 v0, -v1, v0, s6 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: fast_frem_f32: +; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[2:3], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f32_e32 v0, s3 -; VI-NEXT: v_mul_f32_e32 v0, s2, v0 -; VI-NEXT: v_trunc_f32_e32 v0, v0 -; VI-NEXT: v_fma_f32 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6 +; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; 
VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6 +; VI-NEXT: v_trunc_f32_e32 v1, v1 +; VI-NEXT: v_fma_f32 v2, -v1, v0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -267,57 +676,238 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem fast float %r0, %r1 + %r2 = frem afn float %r0, %r1 store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { -; CI-LABEL: unsafe_frem_f32: +define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: frem_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s6, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s2 -; CI-NEXT: v_rcp_f32_e32 v0, s3 -; CI-NEXT: v_mul_f32_e32 v0, s2, v0 -; CI-NEXT: v_trunc_f32_e32 v0, v0 -; CI-NEXT: v_fma_f32 v0, -v0, s3, v1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; CI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB6_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: 
s_brev_b32 s7, 1 +; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: .LBB6_2: ; %Flow16 +; CI-NEXT: s_xor_b32 s6, s6, 1 +; CI-NEXT: s_cmp_lg_u32 s6, 0 +; CI-NEXT: s_cbranch_scc1 .LBB6_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]| +; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8 +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0 +; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB6_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7 +; CI-NEXT: .LBB6_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; CI-NEXT: v_fma_f64 v[4:5], 
-v[4:5], v[0:1], v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; CI-NEXT: s_cbranch_vccnz .LBB6_5 +; CI-NEXT: s_branch .LBB6_7 +; CI-NEXT: .LBB6_6: +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: .LBB6_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9 +; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_brev_b32 s7, 1 +; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_or_b32_e32 v0, s6, v0 +; CI-NEXT: v_or_b32_e32 v1, s7, v1 +; CI-NEXT: .LBB6_8: ; %Flow17 +; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[4:5], 0 +; CI-NEXT: v_mov_b32_e32 v2, 0x7ff80000 +; CI-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |s[2:3]|, v[0:1] ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: unsafe_frem_f32: +; VI-LABEL: frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: 
s_mov_b32 s6, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f32_e32 v0, s3 -; VI-NEXT: v_mul_f32_e32 v0, s2, v0 -; VI-NEXT: v_trunc_f32_e32 v0, v0 -; VI-NEXT: v_fma_f32 v2, -v0, s3, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]| +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: s_brev_b32 s7, 1 +; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: .LBB6_2: ; %Flow16 +; VI-NEXT: s_xor_b32 s6, s6, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cbranch_scc1 .LBB6_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]| +; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8 +; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0 +; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; VI-NEXT: 
v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB6_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7 +; VI-NEXT: .LBB6_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; VI-NEXT: s_cbranch_vccnz .LBB6_5 +; VI-NEXT: s_branch .LBB6_7 +; VI-NEXT: .LBB6_6: +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: .LBB6_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9 +; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: s_brev_b32 s7, 1 +; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7] +; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: 
v_ldexp_f64 v[0:1], v[0:1], v8 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: v_or_b32_e32 v0, s6, v0 +; VI-NEXT: v_or_b32_e32 v1, s7, v1 +; VI-NEXT: .LBB6_8: ; %Flow17 +; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v2, 0x7ff80000 +; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |s[2:3]|, v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm - %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 - %r0 = load float, ptr addrspace(1) %in1, align 4 - %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem afn float %r0, %r1 - store float %r2, ptr addrspace(1) %out, align 4 + %r0 = load double, ptr addrspace(1) %in1, align 8 + %r1 = load double, ptr addrspace(1) %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: frem_f64: +define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -345,7 +935,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f64: +; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -371,63 +961,6 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: v_mov_b32_e32 v3, 
s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm - %r0 = load double, ptr addrspace(1) %in1, align 8 - %r1 = load double, ptr addrspace(1) %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, ptr addrspace(1) %out, align 8 - ret void -} - -define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { -; CI-LABEL: fast_frem_f64: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: fast_frem_f64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 
s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] -; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 @@ -445,20 +978,23 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3] +; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] +; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3] +; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 
v[0:1], v[6:7], v[0:1], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -470,18 +1006,21 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 -; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] -; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3] +; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] +; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3] +; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -497,102 +1036,372 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void 
@frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd +; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: s_load_dword s3, s[4:5], 0x4 +; CI-NEXT: s_load_dword s0, s[10:11], 0x0 +; CI-NEXT: s_load_dword s1, s[2:3], 0x4 +; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: s_lshr_b32 s5, s3, 16 -; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; CI-NEXT: s_cbranch_vccz .LBB9_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s2, s0, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB9_2: ; %Flow57 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB9_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 +; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB9_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB9_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; CI-NEXT: s_cbranch_vccnz .LBB9_5 +; CI-NEXT: s_branch .LBB9_7 +; CI-NEXT: .LBB9_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB9_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; 
CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 -; CI-NEXT: v_rcp_f32_e32 v5, v3 +; CI-NEXT: s_and_b32 s2, s0, 0x8000 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_or_b32_e32 v0, s2, v0 +; CI-NEXT: .LBB9_8: ; %Flow58 +; CI-NEXT: s_lshr_b32 s2, s0, 16 +; CI-NEXT: s_lshr_b32 s3, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s2| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s3| +; CI-NEXT: s_mov_b32 s4, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; CI-NEXT: s_cbranch_vccz .LBB9_10 +; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: .LBB9_10: ; %Flow53 +; CI-NEXT: s_xor_b32 s4, s4, 1 +; CI-NEXT: s_cmp_lg_u32 s4, 0 +; CI-NEXT: s_cbranch_scc1 .LBB9_16 +; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 +; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v4, v5 -; CI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, 
v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB9_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CI-NEXT: .LBB9_13: ; %frem.loop_body27 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; CI-NEXT: s_cbranch_vccnz .LBB9_13 +; CI-NEXT: s_branch .LBB9_15 +; CI-NEXT: .LBB9_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: s_and_b32 s4, s2, 0x8000 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_or_b32_e32 v1, s4, v1 +; CI-NEXT: .LBB9_16: ; %Flow54 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, 0 
+; CI-NEXT: s_and_b32 s0, s0, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00 +; CI-NEXT: s_cselect_b32 s4, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; CI-NEXT: s_and_b32 s2, s2, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 +; CI-NEXT: s_cselect_b32 s2, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_mov_b32_e32 v2, 0x7e00 +; CI-NEXT: s_and_b32 s3, 1, s4 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3 +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; CI-NEXT: s_and_b32 s0, 1, s2 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 +; VI-NEXT: s_load_dword s0, s[10:11], 0x0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x10 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_rcp_f32_e32 v3, v2 -; VI-NEXT: v_mul_f32_e32 v4, v0, v3 -; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 -; VI-NEXT: v_mac_f32_e32 v4, v5, v3 -; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 -; VI-NEXT: v_mul_f32_e32 v0, v0, v3 -; VI-NEXT: 
v_and_b32_e32 v0, 0xff800000, v0 -; VI-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; VI-NEXT: s_cbranch_vccz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB9_2: ; %Flow57 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB9_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; VI-NEXT: v_ldexp_f32 v1, v3, 1 +; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 11 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB9_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB9_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, 
-v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; VI-NEXT: s_cbranch_vccnz .LBB9_5 +; VI-NEXT: s_branch .LBB9_7 +; VI-NEXT: .LBB9_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB9_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 -; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v0, -v0, v1, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; VI-NEXT: v_rcp_f32_e32 v4, v3 -; VI-NEXT: v_mul_f32_e32 v5, v1, v4 -; VI-NEXT: v_mad_f32 v6, -v3, v5, v1 -; VI-NEXT: v_mac_f32_e32 v5, v6, v4 -; VI-NEXT: v_mad_f32 v1, -v3, v5, v1 -; VI-NEXT: v_mul_f32_e32 v1, v1, v4 -; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; VI-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-NEXT: s_and_b32 s2, s0, 0x8000 +; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: .LBB9_8: ; %Flow58 +; VI-NEXT: s_lshr_b32 s4, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s1, 16 +; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4| +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; VI-NEXT: s_mov_b32 s3, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; VI-NEXT: s_cbranch_vccz .LBB9_10 +; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: s_and_b32 s3, s4, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; VI-NEXT: s_mov_b32 
s3, 0 +; VI-NEXT: .LBB9_10: ; %Flow53 +; VI-NEXT: s_xor_b32 s3, s3, 1 +; VI-NEXT: s_cmp_lg_u32 s3, 0 +; VI-NEXT: s_cbranch_scc1 .LBB9_16 +; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; VI-NEXT: v_ldexp_f32 v2, v4, 1 +; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 11 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB9_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB9_13: ; %frem.loop_body27 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; VI-NEXT: s_cbranch_vccnz .LBB9_13 +; VI-NEXT: s_branch .LBB9_15 +; VI-NEXT: .LBB9_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; VI-NEXT: v_add_u32_e32 v3, 
vcc, -10, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s4 -; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v1, -v1, v2, s4 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s3, s4, 0x8000 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v1, s3, v1 +; VI-NEXT: .LBB9_16: ; %Flow54 +; VI-NEXT: v_mov_b32_e32 v2, 0x7c00 +; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s1, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s0|, v2 +; VI-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[4:5], |s4|, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[2:3] +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 @@ -606,176 +1415,714 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], 
s[4:5], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; CI-NEXT: s_mov_b32 s0, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CI-NEXT: s_lshr_b32 s8, s2, 16 -; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_lshr_b32 s10, s4, 16 -; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0 -; CI-NEXT: s_lshr_b32 s11, s5, 16 -; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s4| +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; CI-NEXT: s_cbranch_vccz .LBB10_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; CI-NEXT: s_mov_b32 s0, 0 +; CI-NEXT: .LBB10_2: ; %Flow135 +; CI-NEXT: s_xor_b32 s0, s0, 1 +; CI-NEXT: s_cmp_lg_u32 s0, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 +; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: 
v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s10 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB10_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; CI-NEXT: s_cbranch_vccnz .LBB10_5 +; CI-NEXT: s_branch .LBB10_7 +; CI-NEXT: .LBB10_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB10_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, 
v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 -; CI-NEXT: v_rcp_f32_e32 v5, v3 +; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 +; CI-NEXT: .LBB10_8: ; %Flow136 +; CI-NEXT: s_lshr_b32 s6, s4, 16 +; CI-NEXT: s_lshr_b32 s0, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s6| +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0| +; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; CI-NEXT: s_cbranch_vccz .LBB10_10 +; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: s_and_b32 s1, s6, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: .LBB10_10: ; %Flow131 +; CI-NEXT: s_xor_b32 s1, s1, 1 +; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_16 +; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 +; CI-NEXT: v_div_scale_f32 v4, s[10:11], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v4, v5 -; CI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: 
v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CI-NEXT: .LBB10_13: ; %frem.loop_body27 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; CI-NEXT: s_cbranch_vccnz .LBB10_13 +; CI-NEXT: s_branch .LBB10_15 +; CI-NEXT: .LBB10_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, v2 -; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2 -; CI-NEXT: v_rcp_f32_e32 v6, v4 +; CI-NEXT: s_and_b32 s1, s6, 0x8000 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: .LBB10_16: ; %Flow132 +; CI-NEXT: 
v_cvt_f32_f16_e64 v4, |s5| +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s3| +; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: ; implicit-def: $vgpr2 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 +; CI-NEXT: s_cbranch_vccz .LBB10_18 +; CI-NEXT: ; %bb.17: ; %frem.else53 +; CI-NEXT: s_and_b32 s1, s5, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: .LBB10_18: ; %Flow127 +; CI-NEXT: s_xor_b32 s1, s1, 1 +; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_24 +; CI-NEXT: ; %bb.19: ; %frem.compute52 +; CI-NEXT: v_frexp_mant_f32_e32 v5, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1 +; CI-NEXT: v_div_scale_f32 v5, s[10:11], v3, v3, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7 +; CI-NEXT: v_ldexp_f32_e64 v6, v2, 11 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v5, v6 -; CI-NEXT: v_fma_f32 v8, -v4, v7, v5 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v4, v7, v5 +; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v9, v10 +; CI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v5, -v5, v11, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2 -; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v2, -v4, v3, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; CI-NEXT: v_div_fmas_f32 v5, v5, v10, 
v11 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 +; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_22 +; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; CI-NEXT: .LBB10_21: ; %frem.loop_body60 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: v_mul_f32_e32 v6, v7, v5 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; CI-NEXT: v_add_f32_e32 v8, v6, v3 +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_ldexp_f32_e64 v6, v6, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; CI-NEXT: s_cbranch_vccnz .LBB10_21 +; CI-NEXT: s_branch .LBB10_23 +; CI-NEXT: .LBB10_22: +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v5 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[2:3], v4, v4, v3 -; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3 -; CI-NEXT: v_rcp_f32_e32 v7, v5 +; CI-NEXT: s_and_b32 s1, s5, 0x8000 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v2, s1, v2 +; CI-NEXT: .LBB10_24: ; %Flow128 +; CI-NEXT: s_lshr_b32 s7, s5, 16 +; CI-NEXT: s_lshr_b32 s10, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |s7| +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s10| +; CI-NEXT: s_mov_b32 s1, 1 +; CI-NEXT: ; implicit-def: $vgpr3 +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; CI-NEXT: s_cbranch_vccz .LBB10_26 +; CI-NEXT: ; %bb.25: ; %frem.else86 +; CI-NEXT: s_and_b32 s1, s7, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 +; CI-NEXT: 
v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v6, s7 +; CI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: .LBB10_26: ; %Flow123 +; CI-NEXT: s_xor_b32 s1, s1, 1 +; CI-NEXT: s_cmp_lg_u32 s1, 0 +; CI-NEXT: s_cbranch_scc1 .LBB10_32 +; CI-NEXT: ; %bb.27: ; %frem.compute85 +; CI-NEXT: v_frexp_mant_f32_e32 v6, v4 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 +; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1 +; CI-NEXT: v_div_scale_f32 v6, s[12:13], v4, v4, 1.0 +; CI-NEXT: v_frexp_mant_f32_e32 v3, v5 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8 +; CI-NEXT: v_ldexp_f32_e64 v7, v3, 11 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; CI-NEXT: v_rcp_f32_e32 v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 -; CI-NEXT: v_fma_f32 v7, v8, v7, v7 -; CI-NEXT: v_mul_f32_e32 v8, v6, v7 -; CI-NEXT: v_fma_f32 v9, -v5, v8, v6 -; CI-NEXT: v_fma_f32 v8, v9, v7, v8 -; CI-NEXT: v_fma_f32 v5, -v5, v8, v6 +; CI-NEXT: v_fma_f32 v12, -v6, v11, 1.0 +; CI-NEXT: v_fma_f32 v11, v12, v11, v11 +; CI-NEXT: v_mul_f32_e32 v12, v10, v11 +; CI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; CI-NEXT: v_fma_f32 v12, v13, v11, v12 +; CI-NEXT: v_fma_f32 v6, -v6, v12, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 +; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_30 +; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; CI-NEXT: .LBB10_29: ; %frem.loop_body93 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: v_mul_f32_e32 v7, v8, v6 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; CI-NEXT: 
v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v9, v7, v4 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5 +; CI-NEXT: s_cbranch_vccnz .LBB10_29 +; CI-NEXT: s_branch .LBB10_31 +; CI-NEXT: .LBB10_30: +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5 +; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 +; CI-NEXT: v_mul_f32_e32 v6, v5, v6 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: s_and_b32 s1, s7, 0x8000 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; CI-NEXT: v_or_b32_e32 v3, s1, v3 +; CI-NEXT: .LBB10_32: ; %Flow124 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, 0 +; CI-NEXT: s_and_b32 s1, s4, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00 +; CI-NEXT: s_cselect_b32 s11, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; CI-NEXT: s_and_b32 s2, s6, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 +; CI-NEXT: s_cselect_b32 s6, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s3 +; CI-NEXT: s_and_b32 s4, s5, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 +; CI-NEXT: s_cselect_b32 s12, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; CI-NEXT: s_and_b32 s7, s7, 0x7fff +; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00 +; CI-NEXT: s_cselect_b32 s7, 1, 0 +; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_mov_b32_e32 v4, 0x7e00 +; CI-NEXT: s_and_b32 s10, 1, s11 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s10 +; CI-NEXT: 
v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; CI-NEXT: s_and_b32 s0, 1, s6 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3 -; CI-NEXT: v_trunc_f32_e32 v5, v5 -; CI-NEXT: v_fma_f32 v3, -v5, v4, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; CI-NEXT: s_and_b32 s0, 1, s12 +; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: s_and_b32 s0, 1, s7 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: s_lshr_b32 s8, s4, 16 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: s_lshr_b32 s6, s2, 16 -; VI-NEXT: 
v_rcp_f32_e32 v3, v2 -; VI-NEXT: s_lshr_b32 s9, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s3, 16 -; VI-NEXT: v_mul_f32_e32 v4, v0, v3 -; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 -; VI-NEXT: v_mac_f32_e32 v4, v5, v3 -; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 -; VI-NEXT: v_mul_f32_e32 v0, v0, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; VI-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s8| +; VI-NEXT: v_cvt_f32_f16_e64 v1, |s6| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 +; VI-NEXT: s_cbranch_vccz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s0, s8, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_2: ; %Flow135 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 +; VI-NEXT: v_ldexp_f32 v1, v3, 1 +; VI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v0, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 11 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; 
VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB10_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2 +; VI-NEXT: s_cbranch_vccnz .LBB10_5 +; VI-NEXT: s_branch .LBB10_7 +; VI-NEXT: .LBB10_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB10_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 -; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v0, -v0, v1, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; VI-NEXT: v_rcp_f32_e32 v4, v3 -; VI-NEXT: v_mul_f32_e32 v5, v1, v4 -; VI-NEXT: v_mad_f32 v6, -v3, v5, v1 -; VI-NEXT: v_mac_f32_e32 v5, v6, v4 -; VI-NEXT: v_mad_f32 v1, -v3, v5, v1 -; VI-NEXT: v_mul_f32_e32 v1, v1, v4 -; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; VI-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-NEXT: s_and_b32 s0, s8, 0x8000 +; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-NEXT: .LBB10_8: ; %Flow136 +; VI-NEXT: s_lshr_b32 s4, s8, 16 +; VI-NEXT: s_lshr_b32 s2, s6, 16 +; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4| +; VI-NEXT: v_cvt_f32_f16_e64 v2, |s2| +; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: 
v_cmp_ngt_f32_e32 vcc, v3, v2 +; VI-NEXT: s_cbranch_vccz .LBB10_10 +; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: s_and_b32 s0, s4, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_10: ; %Flow131 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_16 +; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 +; VI-NEXT: v_ldexp_f32 v2, v4, 1 +; VI-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v1, v3 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 11 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB10_13: ; %frem.loop_body27 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, 
vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3 +; VI-NEXT: s_cbranch_vccnz .LBB10_13 +; VI-NEXT: s_branch .LBB10_15 +; VI-NEXT: .LBB10_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6 -; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v1, -v1, v2, s6 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: v_rcp_f32_e32 v5, v4 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mul_f32_e32 v6, v2, v5 -; VI-NEXT: v_mad_f32 v7, -v4, v6, v2 -; VI-NEXT: v_mac_f32_e32 v6, v7, v5 -; VI-NEXT: v_mad_f32 v2, -v4, v6, v2 -; VI-NEXT: v_mul_f32_e32 v2, v2, v5 -; VI-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; VI-NEXT: v_add_f32_e32 v2, v2, v6 +; VI-NEXT: s_and_b32 s0, s4, 0x8000 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 +; VI-NEXT: .LBB10_16: ; %Flow132 +; VI-NEXT: v_cvt_f32_f16_e64 v4, |s9| +; VI-NEXT: v_cvt_f32_f16_e64 v3, |s7| +; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 +; VI-NEXT: s_cbranch_vccz .LBB10_18 +; VI-NEXT: ; %bb.17: ; %frem.else53 +; VI-NEXT: s_and_b32 s0, s9, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_18: ; %Flow127 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; 
VI-NEXT: s_cbranch_scc1 .LBB10_24 +; VI-NEXT: ; %bb.19: ; %frem.compute52 +; VI-NEXT: v_frexp_mant_f32_e32 v5, v3 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; VI-NEXT: v_ldexp_f32 v3, v5, 1 +; VI-NEXT: v_div_scale_f32 v5, s[0:1], v3, v3, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7 +; VI-NEXT: v_ldexp_f32 v6, v2, 11 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; VI-NEXT: v_rcp_f32_e32 v10, v5 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v11, v10, v10 +; VI-NEXT: v_mul_f32_e32 v11, v9, v10 +; VI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; VI-NEXT: v_fma_f32 v11, v12, v10, v11 +; VI-NEXT: v_fma_f32 v5, -v5, v11, v9 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 +; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_22 +; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; VI-NEXT: .LBB10_21: ; %frem.loop_body60 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: v_mul_f32_e32 v6, v7, v5 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; VI-NEXT: v_add_f32_e32 v8, v6, v3 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 +; VI-NEXT: v_ldexp_f32 v6, v6, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; VI-NEXT: s_cbranch_vccnz .LBB10_21 +; VI-NEXT: s_branch .LBB10_23 +; VI-NEXT: .LBB10_22: +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 +; VI-NEXT: v_ldexp_f32 v4, v7, v4 +; VI-NEXT: v_mul_f32_e32 v5, v4, v5 +; 
VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-NEXT: v_ldexp_f32 v2, v3, v2 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3 -; VI-NEXT: v_trunc_f16_e32 v2, v2 -; VI-NEXT: v_fma_f16 v2, -v2, v3, s3 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; VI-NEXT: v_rcp_f32_e32 v6, v5 -; VI-NEXT: v_mul_f32_e32 v7, v3, v6 -; VI-NEXT: v_mad_f32 v8, -v5, v7, v3 -; VI-NEXT: v_mac_f32_e32 v7, v8, v6 -; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 -; VI-NEXT: v_mul_f32_e32 v3, v3, v6 -; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; VI-NEXT: v_add_f32_e32 v3, v3, v7 +; VI-NEXT: s_and_b32 s0, s9, 0x8000 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v2, s0, v2 +; VI-NEXT: .LBB10_24: ; %Flow128 +; VI-NEXT: s_lshr_b32 s12, s9, 16 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: v_cvt_f32_f16_e64 v5, |s12| +; VI-NEXT: v_cvt_f32_f16_e64 v4, |s10| +; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; VI-NEXT: s_cbranch_vccz .LBB10_26 +; VI-NEXT: ; %bb.25: ; %frem.else86 +; VI-NEXT: s_and_b32 s0, s12, 0x8000 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: .LBB10_26: ; %Flow123 +; VI-NEXT: s_xor_b32 s0, s0, 1 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc1 .LBB10_32 +; VI-NEXT: ; %bb.27: ; %frem.compute85 +; VI-NEXT: v_frexp_mant_f32_e32 v6, v4 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 +; VI-NEXT: v_ldexp_f32 v4, v6, 1 +; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, 1.0 +; VI-NEXT: v_frexp_mant_f32_e32 v3, v5 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5 +; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8 +; VI-NEXT: v_ldexp_f32 v7, v3, 11 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; VI-NEXT: v_rcp_f32_e32 v11, v6 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0 +; VI-NEXT: v_fma_f32 v11, v12, v11, v11 +; VI-NEXT: v_mul_f32_e32 v12, v10, v11 +; VI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; VI-NEXT: v_fma_f32 v12, v13, v11, v12 +; VI-NEXT: v_fma_f32 v6, -v6, v12, v10 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 +; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_30 +; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 +; VI-NEXT: .LBB10_29: ; %frem.loop_body93 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: v_mul_f32_e32 v7, v8, v6 +; VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v9, v7, v4 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5 +; VI-NEXT: v_ldexp_f32 v7, v7, 11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5 +; VI-NEXT: s_cbranch_vccnz .LBB10_29 +; VI-NEXT: s_branch .LBB10_31 +; VI-NEXT: .LBB10_30: +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5 +; VI-NEXT: v_ldexp_f32 v5, v8, v5 +; VI-NEXT: v_mul_f32_e32 v6, v5, v6 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v4, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; VI-NEXT: v_ldexp_f32 v3, v4, v3 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7 -; VI-NEXT: v_trunc_f16_e32 v3, v3 -; VI-NEXT: v_fma_f16 v3, -v3, v4, s7 -; VI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v3 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s0, s12, 0x8000 +; VI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v3, s0, v3 +; VI-NEXT: .LBB10_32: ; %Flow124 +; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s6, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s8|, v4 +; VI-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, 0 +; VI-NEXT: v_cmp_nge_f16_e64 s[4:5], |s4|, v4 +; VI-NEXT: v_cmp_nge_f16_e64 s[8:9], |s9|, v4 +; VI-NEXT: v_cmp_nge_f16_e64 s[12:13], |s12|, v4 +; VI-NEXT: v_mov_b32_e32 v4, 0x7e00 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[2:3] +; VI-NEXT: v_cmp_nlg_f16_e64 s[6:7], s7, 0 +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[4:5] +; VI-NEXT: v_cmp_nlg_f16_e64 s[10:11], s10, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[6:7] +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[10:11] +; VI-NEXT: v_cndmask_b32_sdwa v2, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_mov_b64 vcc, s[12:13] +; VI-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[8:9] +; VI-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 @@ -791,43 +2138,171 @@ define amdgpu_kernel void 
@frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s6, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 -; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 -; CI-NEXT: v_rcp_f32_e32 v3, v1 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; CI-NEXT: ; implicit-def: $vgpr0 +; CI-NEXT: s_cbranch_vccz .LBB11_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s6, s2, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; CI-NEXT: v_mov_b32_e32 v1, s6 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: .LBB11_2: ; %Flow53 +; CI-NEXT: s_xor_b32 s6, s6, 1 +; CI-NEXT: s_cmp_lg_u32 s6, 0 +; CI-NEXT: s_cbranch_scc1 .LBB11_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4| +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; CI-NEXT: v_fma_f32 v3, v4, v3, v3 -; CI-NEXT: v_mul_f32_e32 v4, v2, v3 -; CI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; CI-NEXT: v_fma_f32 v4, v5, v3, v4 -; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; 
CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 -; CI-NEXT: v_trunc_f32_e32 v1, v1 -; CI-NEXT: v_fma_f32 v0, -v1, v0, s2 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB11_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB11_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; CI-NEXT: s_cbranch_vccnz .LBB11_5 +; CI-NEXT: s_branch .LBB11_7 +; CI-NEXT: .LBB11_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB11_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CI-NEXT: s_and_b32 s6, s2, 0x80000000 +; CI-NEXT: v_or_b32_e32 v0, s6, v0 +; CI-NEXT: .LBB11_8: ; %Flow54 ; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3 -; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1| +; CI-NEXT: 
s_mov_b32 s6, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB11_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: s_and_b32 s6, s3, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v2, s5 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2| +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: .LBB11_10: ; %Flow49 +; CI-NEXT: s_xor_b32 s6, s6, 1 +; CI-NEXT: s_cmp_lg_u32 s6, 0 +; CI-NEXT: s_cbranch_scc1 .LBB11_16 +; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5| +; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1 +; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5| +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB11_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; 
CI-NEXT: .LBB11_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; CI-NEXT: s_cbranch_vccnz .LBB11_13 +; CI-NEXT: s_branch .LBB11_15 +; CI-NEXT: .LBB11_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: s_and_b32 s6, s3, 0x80000000 +; CI-NEXT: v_or_b32_e32 v1, s6, v1 +; CI-NEXT: .LBB11_16: ; %Flow50 +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s5, 0 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s3|, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s3 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v1, -v2, v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -836,42 +2311,170 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s6, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 -; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 -; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_cbranch_vccz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s6, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1| +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: .LBB11_2: ; %Flow53 +; VI-NEXT: s_xor_b32 s6, s6, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cbranch_scc1 .LBB11_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4| +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 12 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; VI-NEXT: v_fma_f32 v3, v4, v3, v3 -; VI-NEXT: v_mul_f32_e32 v4, v2, v3 -; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; VI-NEXT: v_fma_f32 v4, v5, v3, v4 -; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, 
v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 -; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v0, -v1, v0, s2 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB11_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB11_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; VI-NEXT: s_cbranch_vccnz .LBB11_5 +; VI-NEXT: s_branch .LBB11_7 +; VI-NEXT: .LBB11_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: .LBB11_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; VI-NEXT: s_and_b32 s6, s2, 0x80000000 +; VI-NEXT: v_or_b32_e32 v0, s6, v0 +; VI-NEXT: .LBB11_8: ; %Flow54 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3 -; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 -; VI-NEXT: v_rcp_f32_e32 v4, v2 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1| +; VI-NEXT: s_mov_b32 s6, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB11_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; 
VI-NEXT: s_and_b32 s6, s3, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2| +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: .LBB11_10: ; %Flow49 +; VI-NEXT: s_xor_b32 s6, s6, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cbranch_scc1 .LBB11_16 +; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5| +; VI-NEXT: v_ldexp_f32 v2, v2, 1 +; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5| +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 12 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; VI-NEXT: v_fma_f32 v4, v5, v4, v4 -; VI-NEXT: v_mul_f32_e32 v5, v3, v4 -; VI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; VI-NEXT: v_fma_f32 v5, v6, v4, v5 -; VI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3 -; VI-NEXT: v_trunc_f32_e32 v2, v2 -; VI-NEXT: v_fma_f32 v1, -v2, v1, s3 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB11_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB11_13: ; 
%frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; VI-NEXT: s_cbranch_vccnz .LBB11_13 +; VI-NEXT: s_branch .LBB11_15 +; VI-NEXT: .LBB11_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: s_and_b32 s6, s3, 0x80000000 +; VI-NEXT: v_or_b32_e32 v1, s6, v1 +; VI-NEXT: .LBB11_16: ; %Flow50 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s5, 0 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s3|, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -892,73 +2495,327 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: 
v_div_scale_f32 v1, s[2:3], v0, v0, s4 -; CI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4 -; CI-NEXT: v_rcp_f32_e32 v3, v1 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| +; CI-NEXT: ; implicit-def: $vgpr0 +; CI-NEXT: s_cbranch_vccz .LBB12_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_and_b32 s2, s4, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1| +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB12_2: ; %Flow127 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8| +; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 +; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v0, |s4| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8| +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5 +; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12 +; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; CI-NEXT: v_rcp_f32_e32 v8, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; CI-NEXT: v_fma_f32 v3, v4, v3, v3 -; CI-NEXT: v_mul_f32_e32 v4, v2, v3 -; CI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; CI-NEXT: v_fma_f32 v4, v5, v3, v4 -; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v9, v8, v8 +; CI-NEXT: v_mul_f32_e32 v9, v7, v8 +; CI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; CI-NEXT: v_fma_f32 v9, v10, v8, v9 +; CI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s4 -; CI-NEXT: v_trunc_f32_e32 v1, v1 -; CI-NEXT: v_fma_f32 v0, -v1, v0, s4 +; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; CI-NEXT: 
v_cmp_ge_i32_e32 vcc, 12, v2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 +; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; CI-NEXT: .LBB12_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: v_mul_f32_e32 v4, v5, v3 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v6, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2 +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; CI-NEXT: s_cbranch_vccnz .LBB12_5 +; CI-NEXT: s_branch .LBB12_7 +; CI-NEXT: .LBB12_6: +; CI-NEXT: v_mov_b32_e32 v5, v4 +; CI-NEXT: .LBB12_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 +; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v2, v3 +; CI-NEXT: v_rndne_f32_e32 v3, v3 +; CI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CI-NEXT: s_and_b32 s2, s4, 0x80000000 +; CI-NEXT: v_or_b32_e32 v0, s2, v0 +; CI-NEXT: .LBB12_8: ; %Flow128 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5 -; CI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5 -; CI-NEXT: v_rcp_f32_e32 v4, v2 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB12_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: s_and_b32 s2, s5, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v2, s9 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2| +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; 
CI-NEXT: .LBB12_10: ; %Flow123 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_16 +; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9| +; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1 +; CI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v1, |s5| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9| +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6 +; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12 +; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; CI-NEXT: v_rcp_f32_e32 v9, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v3, v4 -; CI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v10, v9, v9 +; CI-NEXT: v_mul_f32_e32 v10, v8, v9 +; CI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; CI-NEXT: v_fma_f32 v10, v11, v9, v10 +; CI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s5 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v1, -v2, v1, s5 +; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CI-NEXT: .LBB12_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: v_mul_f32_e32 v5, v6, v4 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; 
CI-NEXT: v_add_f32_e32 v7, v5, v2 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; CI-NEXT: s_cbranch_vccnz .LBB12_13 +; CI-NEXT: s_branch .LBB12_15 +; CI-NEXT: .LBB12_14: +; CI-NEXT: v_mov_b32_e32 v6, v5 +; CI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 +; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; CI-NEXT: v_mul_f32_e32 v4, v3, v4 +; CI-NEXT: v_rndne_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; CI-NEXT: v_add_f32_e32 v2, v3, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: s_and_b32 s2, s5, 0x80000000 +; CI-NEXT: v_or_b32_e32 v1, s2, v1 +; CI-NEXT: .LBB12_16: ; %Flow124 ; CI-NEXT: v_mov_b32_e32 v2, s10 -; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 -; CI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 -; CI-NEXT: v_rcp_f32_e32 v5, v3 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr2 +; CI-NEXT: s_cbranch_vccz .LBB12_18 +; CI-NEXT: ; %bb.17: ; %frem.else47 +; CI-NEXT: s_and_b32 s2, s6, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v3, s10 +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3| +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB12_18: ; %Flow119 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_24 +; CI-NEXT: ; %bb.19: ; %frem.compute46 +; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10| +; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v2, |s6| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10| +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7 +; CI-NEXT: v_ldexp_f32_e64 
v6, v2, 12 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v4, v5 -; CI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v9, v10 +; CI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v5, -v5, v11, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v2, -v3, v2, s6 +; CI-NEXT: v_div_fmas_f32 v5, v5, v10, v11 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 +; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_22 +; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; CI-NEXT: .LBB12_21: ; %frem.loop_body54 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: v_mul_f32_e32 v6, v7, v5 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; CI-NEXT: v_add_f32_e32 v8, v6, v3 +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -12, v4 +; CI-NEXT: v_ldexp_f32_e64 v6, v6, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; CI-NEXT: s_cbranch_vccnz .LBB12_21 +; CI-NEXT: s_branch .LBB12_23 +; CI-NEXT: .LBB12_22: +; CI-NEXT: v_mov_b32_e32 v7, v6 +; CI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v5 +; 
CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; CI-NEXT: s_and_b32 s2, s6, 0x80000000 +; CI-NEXT: v_or_b32_e32 v2, s2, v2 +; CI-NEXT: .LBB12_24: ; %Flow120 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 -; CI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 -; CI-NEXT: v_rcp_f32_e32 v6, v4 +; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr3 +; CI-NEXT: s_cbranch_vccz .LBB12_26 +; CI-NEXT: ; %bb.25: ; %frem.else78 +; CI-NEXT: s_and_b32 s2, s7, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v4, s11 +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4| +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB12_26: ; %Flow115 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB12_32 +; CI-NEXT: ; %bb.27: ; %frem.compute77 +; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11| +; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1 +; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v3, |s7| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7| +; CI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11| +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8 +; CI-NEXT: v_ldexp_f32_e64 v7, v3, 12 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; CI-NEXT: v_rcp_f32_e32 v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v5, v6 -; CI-NEXT: v_fma_f32 v8, -v4, v7, v5 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v4, v7, v5 +; CI-NEXT: v_fma_f32 v12, 
-v6, v11, 1.0 +; CI-NEXT: v_fma_f32 v11, v12, v11, v11 +; CI-NEXT: v_mul_f32_e32 v12, v10, v11 +; CI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; CI-NEXT: v_fma_f32 v12, v13, v11, v12 +; CI-NEXT: v_fma_f32 v6, -v6, v12, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 +; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_30 +; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8 +; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; CI-NEXT: .LBB12_29: ; %frem.loop_body85 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: v_mul_f32_e32 v7, v8, v6 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v9, v7, v4 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CI-NEXT: v_add_i32_e32 v5, vcc, -12, v5 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5 +; CI-NEXT: s_cbranch_vccnz .LBB12_29 +; CI-NEXT: s_branch .LBB12_31 +; CI-NEXT: .LBB12_30: +; CI-NEXT: v_mov_b32_e32 v8, v7 +; CI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5 +; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 +; CI-NEXT: v_mul_f32_e32 v6, v5, v6 +; CI-NEXT: v_rndne_f32_e32 v6, v6 +; CI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: s_and_b32 s2, s7, 0x80000000 +; CI-NEXT: v_or_b32_e32 v3, s2, v3 +; CI-NEXT: .LBB12_32: ; %Flow116 +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s8, 0 +; CI-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s4|, v5 +; 
CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s9, 0 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s5|, v5 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s10, 0 +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s6|, v5 +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s11, 0 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s7|, v5 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 -; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v3, -v4, v3, s7 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -969,71 +2826,325 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4 -; VI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4 -; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_cbranch_vccz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1| +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_2: ; %Flow127 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8| +; VI-NEXT: v_ldexp_f32 v1, v1, 1 +; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 +; 
VI-NEXT: v_frexp_mant_f32_e64 v0, |s4| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8| +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5 +; VI-NEXT: v_ldexp_f32 v4, v0, 12 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v8, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; VI-NEXT: v_fma_f32 v3, v4, v3, v3 -; VI-NEXT: v_mul_f32_e32 v4, v2, v3 -; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; VI-NEXT: v_fma_f32 v4, v5, v3, v4 -; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0 +; VI-NEXT: v_fma_f32 v8, v9, v8, v8 +; VI-NEXT: v_mul_f32_e32 v9, v7, v8 +; VI-NEXT: v_fma_f32 v10, -v3, v9, v7 +; VI-NEXT: v_fma_f32 v9, v10, v8, v9 +; VI-NEXT: v_fma_f32 v3, -v3, v9, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s4 -; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v0, -v1, v0, s4 +; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; VI-NEXT: .LBB12_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: v_mul_f32_e32 v4, v5, v3 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v4, -v4, v1, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v6, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2 +; VI-NEXT: v_ldexp_f32 v4, v4, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2 +; VI-NEXT: s_cbranch_vccnz .LBB12_5 +; VI-NEXT: s_branch .LBB12_7 +; VI-NEXT: .LBB12_6: +; VI-NEXT: v_mov_b32_e32 v5, v4 +; 
VI-NEXT: .LBB12_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 +; VI-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-NEXT: v_mul_f32_e32 v3, v2, v3 +; VI-NEXT: v_rndne_f32_e32 v3, v3 +; VI-NEXT: v_fma_f32 v2, -v3, v1, v2 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2 +; VI-NEXT: v_add_f32_e32 v1, v2, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: .LBB12_8: ; %Flow128 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5 -; VI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5 -; VI-NEXT: v_rcp_f32_e32 v4, v2 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB12_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: s_and_b32 s2, s5, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2| +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_10: ; %Flow123 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_16 +; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9| +; VI-NEXT: v_ldexp_f32 v2, v2, 1 +; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v1, |s5| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9| +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6 +; VI-NEXT: v_ldexp_f32 v5, v1, 12 +; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; VI-NEXT: v_rcp_f32_e32 v9, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; VI-NEXT: v_fma_f32 v4, v5, v4, v4 -; VI-NEXT: v_mul_f32_e32 v5, v3, v4 -; 
VI-NEXT: v_fma_f32 v6, -v2, v5, v3 -; VI-NEXT: v_fma_f32 v5, v6, v4, v5 -; VI-NEXT: v_fma_f32 v2, -v2, v5, v3 +; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0 +; VI-NEXT: v_fma_f32 v9, v10, v9, v9 +; VI-NEXT: v_mul_f32_e32 v10, v8, v9 +; VI-NEXT: v_fma_f32 v11, -v4, v10, v8 +; VI-NEXT: v_fma_f32 v10, v11, v9, v10 +; VI-NEXT: v_fma_f32 v4, -v4, v10, v8 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s5 -; VI-NEXT: v_trunc_f32_e32 v2, v2 -; VI-NEXT: v_fma_f32 v1, -v2, v1, s5 +; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; VI-NEXT: .LBB12_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v4 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v2, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v7, v5, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3 +; VI-NEXT: v_ldexp_f32 v5, v5, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3 +; VI-NEXT: s_cbranch_vccnz .LBB12_13 +; VI-NEXT: s_branch .LBB12_15 +; VI-NEXT: .LBB12_14: +; VI-NEXT: v_mov_b32_e32 v6, v5 +; VI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 +; VI-NEXT: v_ldexp_f32 v3, v6, v3 +; VI-NEXT: v_mul_f32_e32 v4, v3, v4 +; VI-NEXT: v_rndne_f32_e32 v4, v4 +; VI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ldexp_f32 v1, v2, v1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: s_and_b32 s2, s5, 0x80000000 +; VI-NEXT: v_or_b32_e32 v1, s2, v1 +; VI-NEXT: .LBB12_16: ; 
%Flow124 ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 -; VI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 -; VI-NEXT: v_rcp_f32_e32 v5, v3 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: s_cbranch_vccz .LBB12_18 +; VI-NEXT: ; %bb.17: ; %frem.else47 +; VI-NEXT: s_and_b32 s2, s6, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3| +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_18: ; %Flow119 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_24 +; VI-NEXT: ; %bb.19: ; %frem.compute46 +; VI-NEXT: v_frexp_mant_f32_e64 v3, |s10| +; VI-NEXT: v_ldexp_f32 v3, v3, 1 +; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v2, |s6| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10| +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7 +; VI-NEXT: v_ldexp_f32 v6, v2, 12 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0 +; VI-NEXT: v_rcp_f32_e32 v10, v5 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; VI-NEXT: v_fma_f32 v5, v6, v5, v5 -; VI-NEXT: v_mul_f32_e32 v6, v4, v5 -; VI-NEXT: v_fma_f32 v7, -v3, v6, v4 -; VI-NEXT: v_fma_f32 v6, v7, v5, v6 -; VI-NEXT: v_fma_f32 v3, -v3, v6, v4 +; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v11, v10, v10 +; VI-NEXT: v_mul_f32_e32 v11, v9, v10 +; VI-NEXT: v_fma_f32 v12, -v5, v11, v9 +; VI-NEXT: v_fma_f32 v11, v12, v10, v11 +; VI-NEXT: v_fma_f32 v5, -v5, v11, v9 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 -; VI-NEXT: v_trunc_f32_e32 v3, 
v3 -; VI-NEXT: v_fma_f32 v2, -v3, v2, s6 +; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 +; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_22 +; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; VI-NEXT: .LBB12_21: ; %frem.loop_body54 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: v_mul_f32_e32 v6, v7, v5 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v6, -v6, v3, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; VI-NEXT: v_add_f32_e32 v8, v6, v3 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, -12, v4 +; VI-NEXT: v_ldexp_f32 v6, v6, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; VI-NEXT: s_cbranch_vccnz .LBB12_21 +; VI-NEXT: s_branch .LBB12_23 +; VI-NEXT: .LBB12_22: +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 +; VI-NEXT: v_ldexp_f32 v4, v7, v4 +; VI-NEXT: v_mul_f32_e32 v5, v4, v5 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-NEXT: v_ldexp_f32 v2, v3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; VI-NEXT: s_and_b32 s2, s6, 0x80000000 +; VI-NEXT: v_or_b32_e32 v2, s2, v2 +; VI-NEXT: .LBB12_24: ; %Flow120 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 -; VI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 -; VI-NEXT: v_rcp_f32_e32 v6, v4 +; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_cbranch_vccz .LBB12_26 +; VI-NEXT: ; %bb.25: ; %frem.else78 +; VI-NEXT: s_and_b32 s2, s7, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v4, s11 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4| +; 
VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB12_26: ; %Flow115 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB12_32 +; VI-NEXT: ; %bb.27: ; %frem.compute77 +; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11| +; VI-NEXT: v_ldexp_f32 v4, v4, 1 +; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v3, |s7| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7| +; VI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11| +; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8 +; VI-NEXT: v_ldexp_f32 v7, v3, 12 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; VI-NEXT: v_rcp_f32_e32 v11, v6 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 -; VI-NEXT: v_fma_f32 v6, v7, v6, v6 -; VI-NEXT: v_mul_f32_e32 v7, v5, v6 -; VI-NEXT: v_fma_f32 v8, -v4, v7, v5 -; VI-NEXT: v_fma_f32 v7, v8, v6, v7 -; VI-NEXT: v_fma_f32 v4, -v4, v7, v5 +; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0 +; VI-NEXT: v_fma_f32 v11, v12, v11, v11 +; VI-NEXT: v_mul_f32_e32 v12, v10, v11 +; VI-NEXT: v_fma_f32 v13, -v6, v12, v10 +; VI-NEXT: v_fma_f32 v12, v13, v11, v12 +; VI-NEXT: v_fma_f32 v6, -v6, v12, v10 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 -; VI-NEXT: v_trunc_f32_e32 v4, v4 -; VI-NEXT: v_fma_f32 v3, -v4, v3, s7 +; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12 +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 +; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_30 +; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 +; VI-NEXT: .LBB12_29: ; %frem.loop_body85 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: v_mul_f32_e32 v7, v8, v6 +; 
VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v7, -v7, v4, v8 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v9, v7, v4 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, -12, v5 +; VI-NEXT: v_ldexp_f32 v7, v7, 12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5 +; VI-NEXT: s_cbranch_vccnz .LBB12_29 +; VI-NEXT: s_branch .LBB12_31 +; VI-NEXT: .LBB12_30: +; VI-NEXT: v_mov_b32_e32 v8, v7 +; VI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5 +; VI-NEXT: v_ldexp_f32 v5, v8, v5 +; VI-NEXT: v_mul_f32_e32 v6, v5, v6 +; VI-NEXT: v_rndne_f32_e32 v6, v6 +; VI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v4, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; VI-NEXT: v_ldexp_f32 v3, v4, v3 +; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; VI-NEXT: s_and_b32 s2, s7, 0x80000000 +; VI-NEXT: v_or_b32_e32 v3, s2, v3 +; VI-NEXT: .LBB12_32: ; %Flow116 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s8, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; VI-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s4|, v5 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s9, 0 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s5|, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s10, 0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s6|, v5 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s11, 0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s7|, v5 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1054,39 +3165,202 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: 
s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 +; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] -; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] -; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] -; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] +; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; CI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CI-NEXT: s_cbranch_vccz .LBB13_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB13_2: ; %Flow53 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB13_8 +; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| +; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]| +; CI-NEXT: 
v_add_i32_e32 v2, vcc, -1, v6 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8 +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0 +; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB13_6 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6 +; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7 +; CI-NEXT: .LBB13_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; CI-NEXT: s_cbranch_vccnz .LBB13_5 +; CI-NEXT: s_branch .LBB13_7 +; CI-NEXT: .LBB13_6: +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: .LBB13_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9 +; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] 
+; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_or_b32_e32 v0, s2, v0 +; CI-NEXT: v_or_b32_e32 v1, s3, v1 +; CI-NEXT: .LBB13_8: ; %Flow54 +; CI-NEXT: v_mov_b32_e32 v2, s10 +; CI-NEXT: v_mov_b32_e32 v3, s11 +; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; CI-NEXT: s_mov_b32 s2, 1 +; CI-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CI-NEXT: s_cbranch_vccz .LBB13_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 ; CI-NEXT: v_mov_b32_e32 v2, s10 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] -; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: .LBB13_10: ; %Flow49 +; CI-NEXT: s_xor_b32 s2, s2, 1 +; CI-NEXT: s_cmp_lg_u32 s2, 0 +; CI-NEXT: s_cbranch_scc1 .LBB13_16 +; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]| +; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| +; CI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]| +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v8 +; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v9 +; CI-NEXT: v_sub_i32_e32 v11, vcc, v4, v10 +; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1 +; CI-NEXT: 
v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0 +; CI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5] +; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; CI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15] +; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 +; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB13_14 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8 +; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9 +; CI-NEXT: .LBB13_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mov_b32_e32 v8, v6 +; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; CI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; CI-NEXT: v_add_i32_e32 v11, vcc, 0xffffffe6, v11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11 +; CI-NEXT: s_cbranch_vccnz .LBB13_13 +; CI-NEXT: s_branch .LBB13_15 +; CI-NEXT: .LBB13_14: +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mov_b32_e32 v8, v6 +; CI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11 +; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_brev_b32 s3, 1 +; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7] +; 
CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_or_b32_e32 v2, s2, v2 +; CI-NEXT: v_or_b32_e32 v3, s3, v3 +; CI-NEXT: .LBB13_16: ; %Flow50 +; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[8:9], 0 +; CI-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s3, 0x7ff00000 +; CI-NEXT: v_cndmask_b32_e64 v5, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e32 v6, v1, v4, vcc +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc +; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[10:11], 0 +; CI-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; CI-NEXT: v_cndmask_b32_e32 v6, v3, v4, vcc +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] -; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] -; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -1097,37 +3371,200 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] -; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] -; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] -; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] +; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_cbranch_vccz .LBB13_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB13_2: ; %Flow53 +; VI-NEXT: s_xor_b32 s2, s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB13_8 +; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| +; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]| +; VI-NEXT: v_add_u32_e32 v2, 
vcc, -1, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8 +; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1 +; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0 +; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3] +; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0 +; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15] +; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13] +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB13_6 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6 +; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7 +; VI-NEXT: .LBB13_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9 +; VI-NEXT: s_cbranch_vccnz .LBB13_5 +; VI-NEXT: s_branch .LBB13_7 +; VI-NEXT: .LBB13_6: +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: .LBB13_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9 +; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: 
v_mul_f64 v[2:3], v[4:5], v[2:3] +; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: v_or_b32_e32 v1, s3, v1 +; VI-NEXT: .LBB13_8: ; %Flow54 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] -; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] -; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] -; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] -; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] -; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] +; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; VI-NEXT: s_mov_b32 s2, 1 +; VI-NEXT: ; implicit-def: $vgpr2_vgpr3 +; VI-NEXT: s_cbranch_vccz .LBB13_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: .LBB13_10: ; %Flow49 +; VI-NEXT: s_xor_b32 s2, 
s2, 1 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cbranch_scc1 .LBB13_16 +; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]| +; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| +; VI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]| +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v8 +; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v9 +; VI-NEXT: v_sub_u32_e32 v11, vcc, v4, v10 +; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1 +; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0 +; VI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5] +; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0 +; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; VI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15] +; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 +; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB13_14 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8 +; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9 +; VI-NEXT: .LBB13_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mov_b32_e32 v8, v6 +; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; VI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0xffffffe6, v11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11 +; 
VI-NEXT: s_cbranch_vccnz .LBB13_13 +; VI-NEXT: s_branch .LBB13_15 +; VI-NEXT: .LBB13_14: +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mov_b32_e32 v8, v6 +; VI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11 +; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5] +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5] +; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 +; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; VI-NEXT: v_or_b32_e32 v2, s2, v2 +; VI-NEXT: v_or_b32_e32 v3, s3, v3 +; VI-NEXT: .LBB13_16: ; %Flow50 +; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[8:9], 0 +; VI-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s3, 0x7ff00000 +; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v1, v4, vcc +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc +; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[10:11], 0 +; VI-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v3, v4, vcc +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll index c429b1a32bde6..39365fe7b6f15 100644 --- 
a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -579,18 +579,159 @@ define amdgpu_kernel void @fdiv_constant_sel_constants(ptr addrspace(1) %p, i1 % ret void } +; ExpandFp now expands frem before it reaches dagcombine. +; TODO Implement this optimization in/before ExpandFP define amdgpu_kernel void @frem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) { -; GCN-LABEL: frem_constant_sel_constants: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s2, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[2:3] -; GCN-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-NEXT: s_endpgm +; GFX9-LABEL: frem_constant_sel_constants: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bitcmp1_b32 s0, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, -4.0, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0x40a00000 +; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], |v1|, s0 +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40a00000 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], |v1|, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB26_3 +; GFX9-NEXT: s_branch .LBB26_7 +; GFX9-NEXT: .LBB26_2: +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: .LBB26_3: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f32_e64 v0, |v1| +; GFX9-NEXT: v_ldexp_f32 v0, v0, 1 +; GFX9-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 +; GFX9-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, 
v4 +; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX9-NEXT: v_div_fmas_f32 v3, v2, v4, v5 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v2, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, 3, v2 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x45200000 +; GFX9-NEXT: v_div_fixup_f32 v3, v3, v0, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB26_6 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v1, 15, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x45200000 +; GFX9-NEXT: .LBB26_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX9-NEXT: v_rndne_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v5, -v5, v0, v4 +; GFX9-NEXT: v_add_f32_e32 v6, v5, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; GFX9-NEXT: v_add_u32_e32 v1, -12, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v1 +; GFX9-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB26_5 +; GFX9-NEXT: .LBB26_6: ; %Flow12 +; GFX9-NEXT: v_add_u32_e32 v1, -11, v1 +; GFX9-NEXT: v_ldexp_f32 v1, v4, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_rndne_f32_e32 v3, v3 +; GFX9-NEXT: v_fma_f32 v1, -v3, v0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-NEXT: .LBB26_7: ; %Flow14 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX942-LABEL: frem_constant_sel_constants: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX942-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: 
s_bitcmp1_b32 s0, 0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v1, v0, -4.0, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 0x40a00000 +; GFX942-NEXT: v_cmp_nlt_f32_e64 s[2:3], |v1|, s0 +; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX942-NEXT: s_cbranch_vccz .LBB26_2 +; GFX942-NEXT: ; %bb.1: ; %frem.else +; GFX942-NEXT: v_mov_b32_e32 v0, 0x40a00000 +; GFX942-NEXT: v_cmp_eq_f32_e64 s[0:1], |v1|, s0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB26_3 +; GFX942-NEXT: s_branch .LBB26_7 +; GFX942-NEXT: .LBB26_2: +; GFX942-NEXT: ; implicit-def: $vgpr0 +; GFX942-NEXT: .LBB26_3: ; %frem.compute +; GFX942-NEXT: v_frexp_mant_f32_e64 v0, |v1| +; GFX942-NEXT: v_ldexp_f32 v0, v0, 1 +; GFX942-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, 1.0 +; GFX942-NEXT: v_rcp_f32_e32 v4, v3 +; GFX942-NEXT: v_frexp_exp_i32_f32_e32 v2, v1 +; GFX942-NEXT: v_sub_u32_e32 v1, 3, v2 +; GFX942-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX942-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX942-NEXT: v_div_scale_f32 v5, vcc, 1.0, v0, 1.0 +; GFX942-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX942-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX942-NEXT: v_fmac_f32_e32 v6, v7, v4 +; GFX942-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX942-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX942-NEXT: v_cmp_gt_i32_e32 vcc, 13, v1 +; GFX942-NEXT: v_div_fixup_f32 v3, v3, v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x45200000 +; GFX942-NEXT: s_cbranch_vccnz .LBB26_6 +; GFX942-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX942-NEXT: v_sub_u32_e32 v1, 15, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x45200000 +; GFX942-NEXT: .LBB26_5: ; %frem.loop_body +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_mov_b32_e32 v4, v5 +; GFX942-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX942-NEXT: v_rndne_f32_e32 v5, v5 +; GFX942-NEXT: v_fma_f32 v5, -v5, v0, v4 +; GFX942-NEXT: v_add_f32_e32 v6, v5, v0 +; GFX942-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; GFX942-NEXT: v_add_u32_e32 v1, -12, 
v1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX942-NEXT: v_cmp_lt_i32_e32 vcc, 12, v1 +; GFX942-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX942-NEXT: s_cbranch_vccnz .LBB26_5 +; GFX942-NEXT: .LBB26_6: ; %Flow12 +; GFX942-NEXT: v_add_u32_e32 v1, -11, v1 +; GFX942-NEXT: v_ldexp_f32 v1, v4, v1 +; GFX942-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX942-NEXT: v_rndne_f32_e32 v3, v3 +; GFX942-NEXT: v_fma_f32 v1, -v3, v0, v1 +; GFX942-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX942-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; GFX942-NEXT: v_add_u32_e32 v2, -1, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX942-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX942-NEXT: .LBB26_7: ; %Flow14 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-NEXT: s_endpgm %sel = select i1 %cond, float -4.0, float 3.0 %bo = frem float 5.0, %sel store float %bo, ptr addrspace(1) %p, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/freeze-binary.ll b/llvm/test/CodeGen/AMDGPU/freeze-binary.ll index b799d6e6b6e9d..93e1e4bcc0236 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze-binary.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze-binary.ll @@ -142,56 +142,108 @@ define <8 x float> @freeze_frem_vec(<8 x float> %input) nounwind { ; CHECK-LABEL: freeze_frem_vec: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_dual_mul_f32 v8, 0x3e800000, v4 :: v_dual_mul_f32 v9, 0x3e800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v11, v0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; CHECK-NEXT: v_trunc_f32_e32 v8, v8 +; CHECK-NEXT: v_div_scale_f32 v11, null, 0x40400000, 0x40400000, v5 +; CHECK-NEXT: v_div_scale_f32 v8, null, 0x40400000, 0x40400000, v2 +; CHECK-NEXT: v_div_scale_f32 v20, s0, v5, 0x40400000, v5 +; CHECK-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_rcp_f32_e32 v13, v11 +; CHECK-NEXT: v_rcp_f32_e32 v10, v8 +; CHECK-NEXT: v_div_scale_f32 v14, vcc_lo, v2, 0x40400000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v12, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_sub_f32_e32 v0, v0, v12 +; CHECK-NEXT: s_waitcnt_depctr 0xfff +; CHECK-NEXT: v_fma_f32 v17, -v11, v13, 1.0 +; CHECK-NEXT: v_fmac_f32_e32 v13, v17, v13 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_dual_mul_f32 v15, 0.5, v6 :: v_dual_mul_f32 v22, v20, v13 +; CHECK-NEXT: v_trunc_f32_e32 v15, v15 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_dual_mul_f32 v9, 0.5, v1 :: v_dual_fmac_f32 v6, -2.0, v15 ; CHECK-NEXT: v_trunc_f32_e32 v9, v9 -; CHECK-NEXT: v_mul_f32_e32 v10, 0.5, v6 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; CHECK-NEXT: v_dual_sub_f32 v0, v0, v11 :: v_dual_mul_f32 v11, 0x3eaaaaab, v5 -; CHECK-NEXT: v_dual_fmac_f32 v4, -4.0, v8 :: v_dual_fmac_f32 v3, -4.0, v9 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_trunc_f32_e32 v10, v10 -; CHECK-NEXT: v_trunc_f32_e32 v9, v7 -; CHECK-NEXT: v_dual_fmac_f32 v6, -2.0, v10 :: v_dual_sub_f32 v7, v7, v9 -; CHECK-NEXT: v_mul_f32_e32 v8, 0.5, v1 -; CHECK-NEXT: v_trunc_f32_e32 v9, v11 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_div_scale_f32 v21, null, 0x40400000, 0x40400000, v6 +; CHECK-NEXT: v_fmac_f32_e32 v1, -2.0, v9 +; CHECK-NEXT: v_fma_f32 v9, -v8, v10, 1.0 +; CHECK-NEXT: v_div_scale_f32 v12, s2, v6, 0x40400000, v6 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_rcp_f32_e32 v24, v21 +; CHECK-NEXT: v_fmac_f32_e32 v10, v9, v10 +; CHECK-NEXT: v_div_scale_f32 v16, 
null, 0x40400000, 0x40400000, v1 +; CHECK-NEXT: v_div_scale_f32 v23, s1, v1, 0x40400000, v1 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; CHECK-NEXT: v_mul_f32_e32 v11, 0x3e800000, v7 +; CHECK-NEXT: v_mul_f32_e32 v19, v14, v10 +; CHECK-NEXT: v_rcp_f32_e32 v18, v16 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_fma_f32 v15, -v8, v19, v14 +; CHECK-NEXT: v_fmac_f32_e32 v19, v15, v10 +; CHECK-NEXT: s_waitcnt_depctr 0xfff +; CHECK-NEXT: v_fma_f32 v15, -v16, v18, 1.0 +; CHECK-NEXT: v_mul_f32_e32 v9, 0x3e800000, v3 +; CHECK-NEXT: v_fma_f32 v8, -v8, v19, v14 +; CHECK-NEXT: v_fma_f32 v14, -v11, v22, v20 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; CHECK-NEXT: v_fmac_f32_e32 v18, v15, v18 +; CHECK-NEXT: v_trunc_f32_e32 v9, v9 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; CHECK-NEXT: v_div_fmas_f32 v8, v8, v10, v19 +; CHECK-NEXT: s_mov_b32 vcc_lo, s0 +; CHECK-NEXT: v_dual_mul_f32 v10, v23, v18 :: v_dual_mul_f32 v17, 0x3e800000, v4 +; CHECK-NEXT: v_fmac_f32_e32 v22, v14, v13 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; CHECK-NEXT: v_div_fixup_f32 v8, v8, 0x40400000, v2 +; CHECK-NEXT: v_fma_f32 v14, -v21, v24, 1.0 +; CHECK-NEXT: v_fma_f32 v15, -v16, v10, v23 +; CHECK-NEXT: v_fmac_f32_e32 v3, -4.0, v9 +; CHECK-NEXT: v_fma_f32 v11, -v11, v22, v20 ; CHECK-NEXT: v_trunc_f32_e32 v8, v8 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; CHECK-NEXT: v_fmac_f32_e32 v1, -2.0, v8 -; CHECK-NEXT: v_fmac_f32_e32 v5, 0xc0400000, v9 -; CHECK-NEXT: v_mul_f32_e32 v10, 0x3eaaaaab, v2 -; CHECK-NEXT: v_mul_f32_e32 v12, 0x3e800000, v0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; CHECK-NEXT: v_trunc_f32_e32 v8, v10 -; CHECK-NEXT: v_trunc_f32_e32 v10, v12 -; CHECK-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; CHECK-NEXT: v_fmac_f32_e32 v24, v14, v24 +; CHECK-NEXT: v_fmac_f32_e32 v10, v15, v18 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; CHECK-NEXT: v_div_fmas_f32 v11, v11, v13, v22 ; CHECK-NEXT: v_fmac_f32_e32 v2, 0xc0400000, v8 -; CHECK-NEXT: v_trunc_f32_e32 v8, v11 -; CHECK-NEXT: v_mul_f32_e32 v12, 0x3eaaaaab, v1 -; CHECK-NEXT: v_dual_fmac_f32 v0, -4.0, v10 :: v_dual_mul_f32 v11, 0.5, v5 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; CHECK-NEXT: v_fmac_f32_e32 v7, -4.0, v8 -; CHECK-NEXT: v_trunc_f32_e32 v9, v12 -; CHECK-NEXT: v_mul_f32_e32 v12, 0x3eaaaaab, v6 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; CHECK-NEXT: v_fmac_f32_e32 v1, 0xc0400000, v9 -; CHECK-NEXT: v_trunc_f32_e32 v9, v11 -; CHECK-NEXT: v_trunc_f32_e32 v11, v3 -; CHECK-NEXT: v_dual_mul_f32 v10, 0.5, v2 :: v_dual_fmac_f32 v5, -2.0, v9 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; CHECK-NEXT: v_sub_f32_e32 v3, v3, v11 +; CHECK-NEXT: v_trunc_f32_e32 v13, v17 +; CHECK-NEXT: s_mov_b32 vcc_lo, s1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; CHECK-NEXT: v_div_fixup_f32 v8, v11, 0x40400000, v5 +; CHECK-NEXT: v_mul_f32_e32 v11, v12, v24 +; CHECK-NEXT: v_fma_f32 v9, -v16, v10, v23 +; CHECK-NEXT: v_trunc_f32_e32 v8, v8 +; CHECK-NEXT: v_fmac_f32_e32 v4, -4.0, v13 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; CHECK-NEXT: v_fma_f32 v13, -v21, v11, v12 +; CHECK-NEXT: v_div_fmas_f32 v9, v9, v18, v10 +; CHECK-NEXT: v_mul_f32_e32 v10, 0x3e800000, v0 +; CHECK-NEXT: v_fmac_f32_e32 v5, 0xc0400000, v8 +; CHECK-NEXT: v_trunc_f32_e32 v8, v7 +; CHECK-NEXT: v_fmac_f32_e32 v11, v13, v24 +; CHECK-NEXT: v_div_fixup_f32 v9, v9, 0x40400000, v1 +; CHECK-NEXT: v_mul_f32_e32 v13, 0.5, v2 +; CHECK-NEXT: s_mov_b32 
vcc_lo, s2 +; CHECK-NEXT: v_sub_f32_e32 v7, v7, v8 ; CHECK-NEXT: v_trunc_f32_e32 v8, v10 -; CHECK-NEXT: v_trunc_f32_e32 v10, v12 +; CHECK-NEXT: v_trunc_f32_e32 v9, v9 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; CHECK-NEXT: v_dual_fmac_f32 v0, -4.0, v8 :: v_dual_fmac_f32 v1, 0xc0400000, v9 +; CHECK-NEXT: v_mul_f32_e32 v8, 0.5, v5 +; CHECK-NEXT: v_fma_f32 v10, -v21, v11, v12 +; CHECK-NEXT: v_trunc_f32_e32 v12, v13 +; CHECK-NEXT: v_trunc_f32_e32 v8, v8 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; CHECK-NEXT: v_div_fmas_f32 v10, v10, v24, v11 +; CHECK-NEXT: v_trunc_f32_e32 v11, v3 +; CHECK-NEXT: v_fmac_f32_e32 v2, -2.0, v12 ; CHECK-NEXT: v_trunc_f32_e32 v12, v4 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; CHECK-NEXT: v_fmac_f32_e32 v2, -2.0, v8 -; CHECK-NEXT: v_fmac_f32_e32 v6, 0xc0400000, v10 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) -; CHECK-NEXT: v_sub_f32_e32 v4, v4, v12 +; CHECK-NEXT: v_fmac_f32_e32 v5, -2.0, v8 +; CHECK-NEXT: v_div_fixup_f32 v9, v10, 0x40400000, v6 +; CHECK-NEXT: v_sub_f32_e32 v3, v3, v11 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_trunc_f32_e32 v9, v9 +; CHECK-NEXT: v_fmac_f32_e32 v6, 0xc0400000, v9 +; CHECK-NEXT: v_mul_f32_e32 v10, 0x3e800000, v7 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_trunc_f32_e32 v10, v10 +; CHECK-NEXT: v_dual_sub_f32 v4, v4, v12 :: v_dual_fmac_f32 v7, -4.0, v10 ; CHECK-NEXT: s_setpc_b64 s[30:31] %x = frem reassoc nsz arcp contract afn <8 x float> %input, %y = freeze <8 x float> %x diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 35913b9a21d30..20009aee6e7ff 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -14,82 +14,231 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr 
addrspace(1) %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: s_cbranch_vccz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: v_bfi_b32 v5, s0, 0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB0_3 +; SI-NEXT: s_branch .LBB0_8 +; SI-NEXT: .LBB0_2: +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB0_3: ; %frem.compute +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 
vcc, |v3|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v3 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: s_cselect_b32 s2, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v2 +; SI-NEXT: s_cselect_b32 s3, s0, 0 +; SI-NEXT: s_add_i32 s0, s3, -1 +; SI-NEXT: v_ldexp_f32_e64 v2, v4, 1 +; SI-NEXT: s_not_b32 s1, s0 +; SI-NEXT: s_add_i32 s1, s1, s2 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0 +; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, 1.0 +; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v4, v5, v4, v4 -; SI-NEXT: v_mul_f32_e32 v5, v2, v4 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; SI-NEXT: v_fma_f32 v5, v6, v4, v5 -; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; SI-NEXT: v_fma_f32 v6, v7, v6, v6 +; SI-NEXT: v_mul_f32_e32 v7, v4, v6 +; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, v8, v6, v7 +; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; SI-NEXT: s_cmp_lt_i32 s1, 12 +; SI-NEXT: s_cbranch_scc1 .LBB0_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_add_i32 s1, s1, 11 +; SI-NEXT: .LBB0_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mul_f32_e32 v3, v5, v4 +; SI-NEXT: v_rndne_f32_e32 v3, v3 +; SI-NEXT: v_fma_f32 v3, -v3, v2, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v6, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB0_5 +; SI-NEXT: ; %bb.6: ; %Flow +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: .LBB0_7: ; %frem.loop_exit +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v3, v3, s1 +; SI-NEXT: v_mul_f32_e32 v4, v3, v4 +; SI-NEXT: v_rndne_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v2, v2, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v4, s0, v2, v0 +; SI-NEXT: .LBB0_8: ; %Flow19 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: s_movk_i32 s0, 0x7c00 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v0 +; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; CI-NEXT: 
s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s2, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s0 -; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: s_mov_b32 s0, s2 -; CI-NEXT: s_mov_b32 s1, s3 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 +; CI-NEXT: s_mov_b32 s4, s10 +; CI-NEXT: s_mov_b32 s5, s11 +; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; CI-NEXT: v_and_b32_e32 v4, 0x7fffffff, v0 +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1| +; CI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB0_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_bfi_b32 v5, s0, 0, v0 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v2 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CI-NEXT: s_cbranch_execz .LBB0_3 +; CI-NEXT: s_branch .LBB0_8 +; CI-NEXT: .LBB0_2: +; CI-NEXT: ; implicit-def: $vgpr3 +; CI-NEXT: .LBB0_3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e32 v3, v4 +; CI-NEXT: v_ldexp_f32_e64 v5, v3, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v3, v2 +; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_div_scale_f32 v9, s[0:1], v3, v3, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; CI-NEXT: v_not_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; CI-NEXT: 
v_rcp_f32_e32 v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v2, v4 -; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v6, v10 +; CI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v6, -v9, v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; CI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; CI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB0_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v4, vcc, v7, v8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v4 +; CI-NEXT: .LBB0_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mul_f32_e32 v5, v7, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v8, v5, v3 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; CI-NEXT: s_cbranch_vccnz .LBB0_5 +; CI-NEXT: ; %bb.6: ; %Flow +; CI-NEXT: v_mov_b32_e32 v5, v7 +; CI-NEXT: .LBB0_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, 
v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_bfi_b32 v3, s0, v2, v0 +; CI-NEXT: .LBB0_8: ; %Flow19 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_movk_i32 s0, 0x7c00 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v0 +; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm @@ -99,60 +248,185 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, 8 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_load_ushort v4, v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s4, 8 +; VI-NEXT: s_addc_u32 s3, s5, 0 +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_load_ushort v1, v[1:2] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; VI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; VI-NEXT: v_rcp_f32_e32 v6, v5 -; VI-NEXT: v_mul_f32_e32 v7, v3, v6 -; 
VI-NEXT: v_mad_f32 v8, -v5, v7, v3 -; VI-NEXT: v_mac_f32_e32 v7, v8, v6 -; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 -; VI-NEXT: v_mul_f32_e32 v3, v3, v6 -; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; VI-NEXT: v_add_f32_e32 v3, v3, v7 -; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 -; VI-NEXT: v_trunc_f16_e32 v3, v3 -; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_cvt_f32_f16_e64 v2, |v1| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v2 +; VI-NEXT: s_cbranch_vccz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v3, s2, 0, v0 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; VI-NEXT: s_cbranch_execz .LBB0_3 +; VI-NEXT: s_branch .LBB0_8 +; VI-NEXT: .LBB0_2: +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: .LBB0_3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e32 v3, v4 +; VI-NEXT: v_ldexp_f32 v5, v3, 11 +; VI-NEXT: v_frexp_mant_f32_e32 v3, v2 +; VI-NEXT: v_ldexp_f32 v3, v3, 1 +; VI-NEXT: v_div_scale_f32 v9, s[2:3], v3, v3, 1.0 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; VI-NEXT: v_not_b32_e32 v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v7 +; VI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; VI-NEXT: v_rcp_f32_e32 v10, v9 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v11, v10, v10 +; VI-NEXT: v_mul_f32_e32 v11, v6, v10 +; VI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; VI-NEXT: v_fma_f32 v11, v12, v10, v11 +; VI-NEXT: v_fma_f32 v6, -v9, v11, v6 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; VI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB0_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v4, vcc, v7, v8 +; 
VI-NEXT: v_add_u32_e32 v4, vcc, 11, v4 +; VI-NEXT: .LBB0_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mul_f32_e32 v5, v7, v6 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v8, v5, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; VI-NEXT: v_ldexp_f32 v5, v5, 11 +; VI-NEXT: s_cbranch_vccnz .LBB0_5 +; VI-NEXT: ; %bb.6: ; %Flow +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: .LBB0_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 +; VI-NEXT: v_ldexp_f32 v4, v5, v4 +; VI-NEXT: v_mul_f32_e32 v5, v4, v6 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-NEXT: v_ldexp_f32 v2, v3, v2 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v3, s2, v2, v0 +; VI-NEXT: .LBB0_8: ; %Flow19 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_movk_i32 s0, 0x7c00 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_cmp_lg_f16_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |v0|, s0 +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: frem_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 +; GFX9-NEXT: global_load_ushort v0, v2, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v2, s[6:7] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; 
GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-NEXT: v_cvt_f32_f16_e64 v2, |v1| +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v2 +; GFX9-NEXT: s_cbranch_vccz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v3, s2, 0, v0 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_branch .LBB0_8 +; GFX9-NEXT: .LBB0_2: +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: .LBB0_3: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v4 +; GFX9-NEXT: v_ldexp_f32 v5, v3, 11 +; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v2 +; GFX9-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX9-NEXT: v_div_scale_f32 v9, s[2:3], v3, v3, 1.0 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v4, v2 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_rcp_f32_e32 v10, v9 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 +; GFX9-NEXT: v_mul_f32_e32 v11, v6, v10 +; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v6 +; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 +; GFX9-NEXT: v_fma_f32 v6, -v9, v11, v6 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; GFX9-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; GFX9-NEXT: 
s_cbranch_vccnz .LBB0_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v4, v7, v8 +; GFX9-NEXT: v_add_u32_e32 v4, 11, v4 +; GFX9-NEXT: .LBB0_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v7, v6 +; GFX9-NEXT: v_rndne_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v5, -v5, v3, v7 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; GFX9-NEXT: v_add_f32_e32 v8, v5, v3 +; GFX9-NEXT: v_add_u32_e32 v4, -11, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; GFX9-NEXT: v_ldexp_f32 v5, v5, 11 +; GFX9-NEXT: s_cbranch_vccnz .LBB0_5 +; GFX9-NEXT: ; %bb.6: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX9-NEXT: v_add_u32_e32 v4, -10, v4 +; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v6 +; GFX9-NEXT: v_rndne_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX9-NEXT: v_trunc_f16_e32 v3, v3 -; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v3, s2, v2, v0 +; GFX9-NEXT: .LBB0_8: ; %Flow19 +; GFX9-NEXT: s_movk_i32 s2, 0x7c00 +; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], |v0|, s2 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7e00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_short v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_f16: @@ -160,28 +434,90 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; 
GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 +; GFX10-NEXT: global_load_ushort v0, v2, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v2, s[6:7] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 -; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 -; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 -; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: v_cvt_f32_f16_e64 v2, |v1| +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: s_cbranch_vccz .LBB0_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB0_3 +; GFX10-NEXT: s_branch .LBB0_8 +; GFX10-NEXT: .LBB0_2: +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: .LBB0_3: ; %frem.compute +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v3 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s2, v5 +; GFX10-NEXT: v_ldexp_f32 v4, v3, 11 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v2 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX10-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX10-NEXT: 
v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX10-NEXT: v_div_scale_f32 v7, s4, v3, v3, 1.0 +; GFX10-NEXT: v_not_b32_e32 v6, v2 +; GFX10-NEXT: v_rcp_f32_e32 v8, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX10-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX10-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB0_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 11 +; GFX10-NEXT: .LBB0_5: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v7, v4 +; GFX10-NEXT: s_add_i32 s2, s2, -11 +; GFX10-NEXT: s_cmp_gt_i32 s2, 11 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX10-NEXT: v_rndne_f32_e32 v4, v4 +; GFX10-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX10-NEXT: ; %bb.6: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX10-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX10-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX10-NEXT: v_rndne_f32_e32 v5, v5 +; GFX10-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, 
v2 +; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, v2, v0 +; GFX10-NEXT: .LBB0_8: ; %Flow19 +; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0| +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX10-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: frem_f16: @@ -189,36 +525,111 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[4:5] offset:8 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v3, |v1.l| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v2, |v0.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB0_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.l, v4.l, vcc_lo 
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_3 +; GFX11-TRUE16-NEXT: s_branch .LBB0_8 +; GFX11-TRUE16-NEXT: .LBB0_2: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: .LBB0_3: ; %frem.compute +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v3 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v3, 11 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX11-TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX11-TRUE16-NEXT: s_denorm_mode 15 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v9, v5, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-TRUE16-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX11-TRUE16-NEXT: s_denorm_mode 12 +; GFX11-TRUE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB0_7 +; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-TRUE16-NEXT: .LBB0_5: ; %frem.loop_body +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-TRUE16-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v2, v1 +; GFX11-TRUE16-NEXT: .LBB0_8: ; %Flow19 +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l| +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, s2 ; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -227,187 +638,604 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX11-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: global_load_u16 v1, v1, s[4:5] offset:8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v2, |v1| +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB0_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB0_3 +; GFX11-FAKE16-NEXT: s_branch .LBB0_8 +; GFX11-FAKE16-NEXT: .LBB0_2: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: .LBB0_3: ; %frem.compute +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v3 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v3, 11 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX11-FAKE16-NEXT: s_denorm_mode 15 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v9, v5, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-FAKE16-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX11-FAKE16-NEXT: s_denorm_mode 12 +; GFX11-FAKE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB0_7 +; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-FAKE16-NEXT: .LBB0_5: ; %frem.loop_body +; 
GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FAKE16-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, v2, v0 +; GFX11-FAKE16-NEXT: .LBB0_8: ; %Flow19 +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0| +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX1150-TRUE16-LABEL: frem_f16: ; GFX1150-TRUE16: ; %bb.0: ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[10:11] +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v1, s[0:1] offset:8 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(TRANS32_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1150-TRUE16-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX1150-TRUE16-NEXT: s_and_b32 s2, s1, 0x7fff +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s1, s0 +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s0, s2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0 +; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s1, s0 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s3 +; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB0_3 +; GFX1150-TRUE16-NEXT: s_branch .LBB0_8 +; GFX1150-TRUE16-NEXT: .LBB0_2: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX1150-TRUE16-NEXT: .LBB0_3: ; %frem.compute +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s0 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s1 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v8, v7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1150-TRUE16-NEXT: s_denorm_mode 15 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v8 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1150-TRUE16-NEXT: v_fma_f32 v10, -v7, v9, v5 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1150-TRUE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1150-TRUE16-NEXT: s_denorm_mode 12 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB0_7 +; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-TRUE16-NEXT: s_sub_i32 s0, s1, s0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_add_i32 s0, s0, 11 +; GFX1150-TRUE16-NEXT: .LBB0_5: ; %frem.loop_body +; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1150-TRUE16-NEXT: 
s_add_i32 s0, s0, -11 +; GFX1150-TRUE16-NEXT: s_cmp_gt_i32 s0, 11 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-TRUE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, s0 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1150-TRUE16-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1150-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX1150-TRUE16-NEXT: .LBB0_8: ; %Flow19 +; GFX1150-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0 +; GFX1150-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, 0x7c00, v0.l +; GFX1150-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, s0 +; GFX1150-TRUE16-NEXT: global_store_b16 v3, v0, s[8:9] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: frem_f16: ; GFX1150-FAKE16: ; %bb.0: ; GFX1150-FAKE16-NEXT: s_clause 0x1 -; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-FAKE16-NEXT: s_clause 0x1 -; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: global_load_u16 v0, v1, s[10:11] +; GFX1150-FAKE16-NEXT: global_load_u16 v1, v1, s[0:1] offset:8 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) -; 
GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1150-FAKE16-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX1150-FAKE16-NEXT: s_and_b32 s2, s1, 0x7fff +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s1, s0 +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s0, s2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s1, s0 +; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s1, s0 +; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 +; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX1150-FAKE16-NEXT: s_cbranch_execz .LBB0_3 +; GFX1150-FAKE16-NEXT: s_branch .LBB0_8 +; GFX1150-FAKE16-NEXT: .LBB0_2: +; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX1150-FAKE16-NEXT: .LBB0_3: ; %frem.compute +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s0 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s1 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1150-FAKE16-NEXT: 
v_rcp_f32_e32 v8, v7 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1150-FAKE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1150-FAKE16-NEXT: s_denorm_mode 15 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v8 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1150-FAKE16-NEXT: v_fma_f32 v10, -v7, v9, v5 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1150-FAKE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1150-FAKE16-NEXT: s_denorm_mode 12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB0_7 +; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-FAKE16-NEXT: s_sub_i32 s0, s1, s0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_add_i32 s0, s0, 11 +; GFX1150-FAKE16-NEXT: .LBB0_5: ; %frem.loop_body +; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1150-FAKE16-NEXT: s_add_i32 s0, s0, -11 +; GFX1150-FAKE16-NEXT: s_cmp_gt_i32 s0, 11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-FAKE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, s0 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1150-FAKE16-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 
0x80000000, v5 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX1150-FAKE16-NEXT: .LBB0_8: ; %Flow19 +; GFX1150-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0 +; GFX1150-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_cmp_nle_f16_e64 s0, 0x7c00, v0 +; GFX1150-FAKE16-NEXT: s_and_b32 vcc_lo, s0, vcc_lo +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX1150-FAKE16-NEXT: global_store_b16 v3, v0, s[8:9] ; GFX1150-FAKE16-NEXT: s_endpgm ; ; GFX1200-TRUE16-LABEL: frem_f16: ; GFX1200-TRUE16: ; %bb.0: ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1200-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[10:11] +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v1, s[0:1] offset:8 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1200-TRUE16-NEXT: 
v_readfirstlane_b32 s0, v0 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1200-TRUE16-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX1200-TRUE16-NEXT: s_and_b32 s2, s1, 0x7fff +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s1, s0 +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s0, s2 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0 +; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s1, s0 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s3 +; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB0_3 +; GFX1200-TRUE16-NEXT: s_branch .LBB0_8 +; GFX1200-TRUE16-NEXT: .LBB0_2: +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX1200-TRUE16-NEXT: .LBB0_3: ; %frem.compute +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s0 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s1 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1200-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v8, v7 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1200-TRUE16-NEXT: s_denorm_mode 15 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v8 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1200-TRUE16-NEXT: v_fma_f32 v10, -v7, v9, v5 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1200-TRUE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1200-TRUE16-NEXT: s_denorm_mode 12 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB0_7 +; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-TRUE16-NEXT: s_sub_co_i32 s0, s1, s0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s0, s0, 11 +; 
GFX1200-TRUE16-NEXT: .LBB0_5: ; %frem.loop_body +; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s0, s0, -11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cmp_gt_i32 s0, 11 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v7, v5 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, s0 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1200-TRUE16-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX1200-TRUE16-NEXT: .LBB0_8: ; %Flow19 +; GFX1200-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0 +; GFX1200-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, 0x7c00, v0.l +; GFX1200-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, s0 +; GFX1200-TRUE16-NEXT: global_store_b16 v3, v0, s[8:9] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: frem_f16: ; GFX1200-FAKE16: ; %bb.0: ; GFX1200-FAKE16-NEXT: s_clause 0x1 -; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1200-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-FAKE16-NEXT: s_clause 0x1 -; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1200-FAKE16-NEXT: 
global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1200-FAKE16-NEXT: global_load_u16 v0, v1, s[10:11] +; GFX1200-FAKE16-NEXT: global_load_u16 v1, v1, s[0:1] offset:8 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1200-FAKE16-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX1200-FAKE16-NEXT: s_and_b32 s2, s1, 0x7fff +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s1, s0 +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s0, s2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s1, s0 +; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s1, s0 +; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 +; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX1200-FAKE16-NEXT: s_cbranch_execz .LBB0_3 +; GFX1200-FAKE16-NEXT: s_branch .LBB0_8 +; GFX1200-FAKE16-NEXT: .LBB0_2: +; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX1200-FAKE16-NEXT: .LBB0_3: ; %frem.compute +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s0 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s1 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v8, v7 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1200-FAKE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1200-FAKE16-NEXT: s_denorm_mode 15 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v8 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1200-FAKE16-NEXT: v_fma_f32 v10, -v7, v9, v5 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1200-FAKE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1200-FAKE16-NEXT: s_denorm_mode 12 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1200-FAKE16-NEXT: s_cbranch_vccnz 
.LBB0_7 +; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-FAKE16-NEXT: s_sub_co_i32 s0, s1, s0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s0, s0, 11 +; GFX1200-FAKE16-NEXT: .LBB0_5: ; %frem.loop_body +; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s0, s0, -11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cmp_gt_i32 s0, 11 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v7, v5 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-FAKE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc1 .LBB0_5 +; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, s0 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1200-FAKE16-NEXT: .LBB0_7: ; %frem.loop_exit +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX1200-FAKE16-NEXT: .LBB0_8: ; %Flow19 +; GFX1200-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0 +; GFX1200-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_cmp_nle_f16_e64 s0, 0x7c00, v0 +; GFX1200-FAKE16-NEXT: s_and_b32 vcc_lo, s0, vcc_lo +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX1200-FAKE16-NEXT: global_store_b16 v3, v0, s[8:9] ; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 @@ -440,10 +1268,22 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_rcp_f32_e32 v2, v1 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: 
v_div_scale_f32 v2, vcc, v0, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; SI-NEXT: v_fma_f32 v4, v5, v4, v4 +; SI-NEXT: v_mul_f32_e32 v5, v2, v4 +; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, v6, v4, v5 +; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -455,24 +1295,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_mov_b32 s8, s0 ; CI-NEXT: s_mov_b32 s9, s1 ; CI-NEXT: s_mov_b32 s0, s2 ; CI-NEXT: s_mov_b32 s1, s3 ; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 +; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v5, v4, v4 +; CI-NEXT: v_mul_f32_e32 v5, v2, v4 +; CI-NEXT: 
v_fma_f32 v6, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, v6, v4, v5 +; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm @@ -492,9 +1344,20 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f16_e32 v3, v2 -; VI-NEXT: v_mul_f16_e32 v3, v4, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; VI-NEXT: v_rcp_f32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v7, v3, v6 +; VI-NEXT: v_mad_f32 v8, -v5, v7, v3 +; VI-NEXT: v_mac_f32_e32 v7, v8, v6 +; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 +; VI-NEXT: v_mul_f32_e32 v3, v3, v6 +; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; VI-NEXT: v_add_f32_e32 v3, v3, v7 +; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v3, v3 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -508,9 +1371,20 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v3, v2 -; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: 
v_mac_f32_e32 v3, v5, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -526,9 +1400,20 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v3, v2 -; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 +; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 +; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -543,14 +1428,32 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX11-TRUE16-NEXT: 
v_cvt_f32_f16_e32 v4, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l ; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -564,12 +1467,28 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; 
GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm @@ -583,15 +1502,33 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l ; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; @@ -605,13 +1542,29 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-FAKE16-NEXT: s_clause 0x1 ; GFX1150-FAKE16-NEXT: global_load_u16 
v1, v0, s[2:3] ; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 ; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm @@ -625,15 +1578,33 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, 
ptr addrspace(1) ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 
0x8000, v1 -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l ; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; @@ -647,13 +1618,29 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1200-FAKE16-NEXT: s_clause 0x1 ; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 ; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm @@ -688,10 +1675,22 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_rcp_f32_e32 v2, v1 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; SI-NEXT: v_fma_f32 v4, v5, v4, v4 +; SI-NEXT: v_mul_f32_e32 v5, v2, v4 +; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, v6, v4, v5 +; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -703,24 +1702,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_mov_b32 s8, s0 ; CI-NEXT: s_mov_b32 s9, s1 ; CI-NEXT: s_mov_b32 s0, s2 ; CI-NEXT: s_mov_b32 s1, s3 ; CI-NEXT: 
s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 +; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v5, v4, v4 +; CI-NEXT: v_mul_f32_e32 v5, v2, v4 +; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, v6, v4, v5 +; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm @@ -740,9 +1751,20 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f16_e32 v3, v2 -; VI-NEXT: v_mul_f16_e32 v3, v4, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; VI-NEXT: v_rcp_f32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v7, v3, v6 +; VI-NEXT: v_mad_f32 v8, -v5, v7, v3 +; VI-NEXT: v_mac_f32_e32 v7, v8, v6 +; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 +; VI-NEXT: v_mul_f32_e32 v3, v3, v6 +; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; VI-NEXT: v_add_f32_e32 v3, v3, v7 +; VI-NEXT: v_cvt_f16_f32_e32 
v3, v3 +; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v3, v3 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -756,9 +1778,20 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v3, v2 -; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -774,9 +1807,20 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v3, v2 -; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 +; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 +; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -791,14 +1835,32 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l ; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -812,12 +1874,28 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm @@ -831,15 +1909,33 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: 
v_cvt_f16_f32_e32 v0.h, v3 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l ; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; @@ -853,13 +1949,29 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-FAKE16-NEXT: s_clause 0x1 ; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 ; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm @@ -873,15 +1985,33 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 
v3, v7, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l ; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; @@ -895,13 +2025,29 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-FAKE16-NEXT: s_clause 0x1 ; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: 
v_mul_f32_e32 v3, v3, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 ; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm @@ -922,6 +2068,819 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB3_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v2, s2, 0, v0 +; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v1| +; SI-NEXT: v_cndmask_b32_e32 
v2, v0, v2, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB3_3 +; SI-NEXT: s_branch .LBB3_8 +; SI-NEXT: .LBB3_2: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB3_3: ; %frem.compute +; SI-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_cselect_b32 s4, s4, 0 +; SI-NEXT: v_frexp_mant_f32_e64 v2, |v0| +; SI-NEXT: v_cndmask_b32_e64 v2, |v0|, v2, s[2:3] +; SI-NEXT: v_ldexp_f32_e64 v3, v2, 12 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6 +; SI-NEXT: v_frexp_mant_f32_e64 v2, |v1| +; SI-NEXT: v_cndmask_b32_e64 v2, |v1|, v2, s[2:3] +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v1 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s2, v4 +; SI-NEXT: s_cselect_b32 s5, s2, 0 +; SI-NEXT: s_add_i32 s2, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v2, v2, 1 +; SI-NEXT: s_not_b32 s3, s2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0 +; SI-NEXT: v_div_scale_f32 v5, s[6:7], v2, v2, 1.0 +; SI-NEXT: v_rcp_f32_e32 v6, v5 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; SI-NEXT: v_fma_f32 v6, v7, v6, v6 +; SI-NEXT: v_mul_f32_e32 v7, v4, v6 +; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, v8, v6, v7 +; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; SI-NEXT: s_cmp_lt_i32 s3, 13 +; SI-NEXT: s_cbranch_scc1 .LBB3_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s3, s4, s5 +; SI-NEXT: s_add_i32 s3, s3, 12 +; SI-NEXT: .LBB3_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mul_f32_e32 v3, v5, v4 +; SI-NEXT: v_rndne_f32_e32 v3, v3 +; SI-NEXT: 
v_fma_f32 v3, -v3, v2, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v6, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 12 +; SI-NEXT: s_add_i32 s3, s3, -12 +; SI-NEXT: s_cmp_gt_i32 s3, 12 +; SI-NEXT: s_cbranch_scc1 .LBB3_5 +; SI-NEXT: ; %bb.6: ; %Flow +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: .LBB3_7: ; %frem.loop_exit +; SI-NEXT: s_add_i32 s3, s3, -11 +; SI-NEXT: v_ldexp_f32_e64 v3, v3, s3 +; SI-NEXT: v_mul_f32_e32 v4, v3, v4 +; SI-NEXT: v_rndne_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v2, v2, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v2, s2, v2, v0 +; SI-NEXT: .LBB3_8: ; %Flow17 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_cmp_nge_f32_e64 s[4:5], |v0|, s4 +; SI-NEXT: s_and_b64 vcc, s[4:5], vcc +; SI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: frem_f32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB3_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v2, s2, 0, v0 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v1| +; CI-NEXT: 
v_cndmask_b32_e32 v2, v0, v2, vcc +; CI-NEXT: s_cbranch_execz .LBB3_3 +; CI-NEXT: s_branch .LBB3_8 +; CI-NEXT: .LBB3_2: +; CI-NEXT: ; implicit-def: $vgpr2 +; CI-NEXT: .LBB3_3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v3, |v1| +; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_div_scale_f32 v9, s[2:3], v3, v3, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v2, |v0| +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 +; CI-NEXT: v_ldexp_f32_e64 v5, v2, 12 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; CI-NEXT: v_not_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v9 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v6, v10 +; CI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v6, -v9, v11, v6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v4 +; CI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB3_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v4, vcc, v7, v8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v4 +; CI-NEXT: .LBB3_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mul_f32_e32 v5, v7, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v8, v5, v3 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -12, v4 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12 +; CI-NEXT: s_cbranch_vccnz .LBB3_5 +; CI-NEXT: ; %bb.6: ; %Flow +; CI-NEXT: v_mov_b32_e32 v5, v7 +; CI-NEXT: .LBB3_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v4, vcc, 
-11, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v2, s2, v2, v0 +; CI-NEXT: .LBB3_8: ; %Flow17 +; CI-NEXT: s_mov_b32 s4, 0x7f800000 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_nge_f32_e64 s[4:5], |v0|, s4 +; CI-NEXT: s_and_b64 vcc, s[4:5], vcc +; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s4, 16 +; VI-NEXT: s_addc_u32 s3, s5, 0 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_load_dword v1, v[1:2] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v2, s2, 0, v0 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v1| +; VI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; VI-NEXT: s_cbranch_execz .LBB3_3 +; VI-NEXT: s_branch .LBB3_8 +; VI-NEXT: .LBB3_2: +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: .LBB3_3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v3, |v1| +; VI-NEXT: v_ldexp_f32 v3, v3, 1 +; VI-NEXT: v_div_scale_f32 v9, s[2:3], v3, v3, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v2, |v0| +; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 +; VI-NEXT: v_ldexp_f32 v5, v2, 12 
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; VI-NEXT: v_not_b32_e32 v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v7 +; VI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; VI-NEXT: v_rcp_f32_e32 v10, v9 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v11, v10, v10 +; VI-NEXT: v_mul_f32_e32 v11, v6, v10 +; VI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; VI-NEXT: v_fma_f32 v11, v12, v10, v11 +; VI-NEXT: v_fma_f32 v6, -v9, v11, v6 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v4 +; VI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB3_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v4, vcc, v7, v8 +; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v4 +; VI-NEXT: .LBB3_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mul_f32_e32 v5, v7, v6 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v8, v5, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, -12, v4 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; VI-NEXT: v_ldexp_f32 v5, v5, 12 +; VI-NEXT: s_cbranch_vccnz .LBB3_5 +; VI-NEXT: ; %bb.6: ; %Flow +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: .LBB3_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 +; VI-NEXT: v_ldexp_f32 v4, v5, v4 +; VI-NEXT: v_mul_f32_e32 v5, v4, v6 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-NEXT: v_ldexp_f32 v2, v3, v2 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 +; VI-NEXT: .LBB3_8: ; %Flow17 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: 
s_mov_b32 s0, 0x7f800000 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_nge_f32_e64 s[0:1], |v0|, s0 +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: flat_store_dword v[3:4], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v2, s[2:3] +; GFX9-NEXT: global_load_dword v1, v2, s[6:7] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v2, s2, 0, v0 +; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v1| +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX9-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-NEXT: s_branch .LBB3_8 +; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: .LBB3_3: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f32_e64 v3, |v1| +; GFX9-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX9-NEXT: v_div_scale_f32 v9, s[2:3], v3, v3, 1.0 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; GFX9-NEXT: v_frexp_mant_f32_e64 v2, |v0| +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; GFX9-NEXT: v_ldexp_f32 v5, v2, 12 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v8 +; GFX9-NEXT: v_not_b32_e32 v4, v2 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_rcp_f32_e32 v10, v9 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 +; GFX9-NEXT: v_mul_f32_e32 v11, v6, v10 +; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v6 +; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 +; GFX9-NEXT: v_fma_f32 v6, -v9, v11, 
v6 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v4 +; GFX9-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB3_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v4, v7, v8 +; GFX9-NEXT: v_add_u32_e32 v4, 12, v4 +; GFX9-NEXT: .LBB3_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v7, v6 +; GFX9-NEXT: v_rndne_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v5, -v5, v3, v7 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; GFX9-NEXT: v_add_f32_e32 v8, v5, v3 +; GFX9-NEXT: v_add_u32_e32 v4, -12, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; GFX9-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB3_5 +; GFX9-NEXT: ; %bb.6: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: .LBB3_7: ; %frem.loop_exit +; GFX9-NEXT: v_add_u32_e32 v4, -11, v4 +; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v6 +; GFX9-NEXT: v_rndne_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v0 +; GFX9-NEXT: .LBB3_8: ; %Flow17 +; GFX9-NEXT: s_mov_b32 s2, 0x7f800000 +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s2 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v0, v2, s[2:3] +; GFX10-NEXT: global_load_dword v1, v2, s[6:7] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v1| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB3_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, v0 +; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1| +; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-NEXT: s_branch .LBB3_8 +; GFX10-NEXT: .LBB3_2: +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: .LBB3_3: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f32_e64 v3, |v1| +; GFX10-NEXT: v_frexp_mant_f32_e64 v2, |v0| +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v0 +; GFX10-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX10-NEXT: v_ldexp_f32 v4, v2, 12 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v2, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v5 +; GFX10-NEXT: v_div_scale_f32 v7, s4, v3, v3, 1.0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX10-NEXT: v_rcp_f32_e32 v8, v7 +; GFX10-NEXT: v_not_b32_e32 v6, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX10-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX10-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB3_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 12 +; GFX10-NEXT: .LBB3_5: ; 
%frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v7, v4 +; GFX10-NEXT: s_add_i32 s2, s2, -12 +; GFX10-NEXT: s_cmp_gt_i32 s2, 12 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX10-NEXT: v_rndne_f32_e32 v4, v4 +; GFX10-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v4, v4, 12 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX10-NEXT: ; %bb.6: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-NEXT: .LBB3_7: ; %frem.loop_exit +; GFX10-NEXT: v_add_nc_u32_e32 v6, -11, v6 +; GFX10-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX10-NEXT: v_rndne_f32_e32 v5, v5 +; GFX10-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX10-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v0 +; GFX10-NEXT: .LBB3_8: ; %Flow17 +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v0| +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo +; GFX10-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v0, v1, s[2:3] +; GFX11-NEXT: global_load_b32 v1, v1, s[4:5] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v1| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %frem.else 
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, v0 +; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB3_3 +; GFX11-NEXT: s_branch .LBB3_8 +; GFX11-NEXT: .LBB3_2: +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: .LBB3_3: ; %frem.compute +; GFX11-NEXT: v_frexp_mant_f32_e64 v3, |v1| +; GFX11-NEXT: v_frexp_mant_f32_e64 v2, |v0| +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX11-NEXT: v_ldexp_f32 v4, v2, 12 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-NEXT: v_rcp_f32_e32 v8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX11-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX11-NEXT: 
v_cmp_gt_i32_e32 vcc_lo, 13, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_7 +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 12 +; GFX11-NEXT: .LBB3_5: ; %frem.loop_body +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v7, v4 +; GFX11-NEXT: s_add_i32 s2, s2, -12 +; GFX11-NEXT: s_cmp_gt_i32 s2, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX11-NEXT: v_rndne_f32_e32 v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v4, v4, 12 +; GFX11-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX11-NEXT: ; %bb.6: ; %Flow +; GFX11-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-NEXT: .LBB3_7: ; %frem.loop_exit +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v6, -11, v6 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX11-NEXT: v_rndne_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX11-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v0 +; GFX11-NEXT: .LBB3_8: ; %Flow17 +; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v0| +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, 0x7fc00000, v2 +; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b32 v2, v0, s[10:11] +; GFX1150-NEXT: global_load_b32 v0, v0, s[0:1] offset:16 +; GFX1150-NEXT: s_waitcnt vmcnt(1) +; GFX1150-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_and_b32_e32 v3, 0x7fffffff, v0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX1150-NEXT: s_cbranch_vccz .LBB3_2 +; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v2 +; GFX1150-NEXT: v_cmp_eq_f32_e32 vcc_lo, v1, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1150-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB3_3 +; GFX1150-NEXT: s_branch .LBB3_8 +; GFX1150-NEXT: .LBB3_2: +; GFX1150-NEXT: ; implicit-def: $vgpr3 +; GFX1150-NEXT: .LBB3_3: ; %frem.compute +; GFX1150-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |v2| +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v6, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_ldexp_f32 v4, v4, 1 +; GFX1150-NEXT: v_ldexp_f32 v5, v3, 12 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; 
GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1150-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1150-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1150-NEXT: v_rcp_f32_e32 v9, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v7, v3 +; GFX1150-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1150-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1150-NEXT: v_fma_f32 v11, -v8, v10, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX1150-NEXT: v_fma_f32 v6, -v8, v10, v6 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 +; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB3_7 +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: s_sub_i32 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s0, s0, 12 +; GFX1150-NEXT: .LBB3_5: ; %frem.loop_body +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_mov_b32_e32 v8, v5 +; GFX1150-NEXT: s_add_i32 s0, s0, -12 +; GFX1150-NEXT: s_cmp_gt_i32 s0, 12 +; 
GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v5, v8, v6 +; GFX1150-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-NEXT: v_fma_f32 v5, v5, v4, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1150-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1150-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX1150-NEXT: ; %bb.6: ; %Flow +; GFX1150-NEXT: v_mov_b32_e32 v7, s0 +; GFX1150-NEXT: v_mov_b32_e32 v5, v8 +; GFX1150-NEXT: .LBB3_7: ; %frem.loop_exit +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_nc_u32_e32 v7, -11, v7 +; GFX1150-NEXT: v_ldexp_f32 v5, v5, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1150-NEXT: v_rndne_f32_e32 v6, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1150-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1150-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v2 +; GFX1150-NEXT: .LBB3_8: ; %Flow17 +; GFX1150-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v0 +; GFX1150-NEXT: v_cmp_nle_f32_e64 s0, 0x7f800000, v1 +; GFX1150-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-NEXT: 
s_and_b32 vcc_lo, s0, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v3, vcc_lo +; GFX1150-NEXT: global_store_b32 v2, v0, s[8:9] +; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_f32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b32 v2, v0, s[10:11] +; GFX1200-NEXT: global_load_b32 v0, v0, s[0:1] offset:16 +; GFX1200-NEXT: s_wait_loadcnt 0x1 +; GFX1200-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_and_b32_e32 v3, 0x7fffffff, v0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX1200-NEXT: s_cbranch_vccz .LBB3_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v2 +; GFX1200-NEXT: v_cmp_eq_f32_e32 vcc_lo, v1, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB3_3 +; GFX1200-NEXT: s_branch .LBB3_8 +; GFX1200-NEXT: .LBB3_2: +; GFX1200-NEXT: ; implicit-def: $vgpr3 +; GFX1200-NEXT: .LBB3_3: ; %frem.compute +; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |v2| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, v2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_ldexp_f32 v4, v4, 1 +; GFX1200-NEXT: v_ldexp_f32 v5, v3, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1200-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1200-NEXT: 
v_add_nc_u32_e32 v3, -1, v3 +; GFX1200-NEXT: v_rcp_f32_e32 v9, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v7, v3 +; GFX1200-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1200-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1200-NEXT: v_fma_f32 v11, -v8, v10, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX1200-NEXT: v_fma_f32 v6, -v8, v10, v6 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 +; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB3_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_add_co_i32 s0, s0, 12 +; GFX1200-NEXT: .LBB3_5: ; %frem.loop_body +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: v_mov_b32_e32 v8, v5 +; GFX1200-NEXT: s_add_co_i32 s0, s0, -12 +; GFX1200-NEXT: s_cmp_gt_i32 s0, 12 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v5, v8, v6 +; GFX1200-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v5, 
0x80000000, v5 +; GFX1200-NEXT: v_fma_f32 v5, v5, v4, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow +; GFX1200-NEXT: v_mov_b32_e32 v7, s0 +; GFX1200-NEXT: v_mov_b32_e32 v5, v8 +; GFX1200-NEXT: .LBB3_7: ; %frem.loop_exit +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v7, -11, v7 +; GFX1200-NEXT: v_ldexp_f32 v5, v5, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1200-NEXT: v_rndne_f32_e32 v6, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1200-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v2 +; GFX1200-NEXT: .LBB3_8: ; %Flow17 +; GFX1200-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v0 +; GFX1200-NEXT: v_cmp_nle_f32_e64 s0, 0x7f800000, v1 +; GFX1200-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-NEXT: s_and_b32 vcc_lo, s0, vcc_lo +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v3, vcc_lo +; GFX1200-NEXT: global_store_b32 v2, v0, s[8:9] +; GFX1200-NEXT: s_endpgm + ptr addrspace(1) %in2) 
#0 { + %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 + %r0 = load float, ptr addrspace(1) %in1, align 4 + %r1 = load float, ptr addrspace(1) %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; SI-LABEL: fast_frem_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s0, s2 @@ -951,7 +2910,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; -; CI-LABEL: frem_f32: +; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -987,7 +2946,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f32: +; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -1021,7 +2980,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: frem_f32: +; GFX9-LABEL: fast_frem_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 @@ -1048,7 +3007,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: frem_f32: +; GFX10-LABEL: fast_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ 
-1077,7 +3036,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: frem_f32: +; GFX11-LABEL: fast_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1112,7 +3071,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX1150-LABEL: frem_f32: +; GFX1150-LABEL: fast_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1148,7 +3107,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ; -; GFX1200-LABEL: frem_f32: +; GFX1200-LABEL: fast_frem_f32: ; GFX1200: ; %bb.0: ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1187,13 +3146,13 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem float %r0, %r1 + %r2 = frem fast float %r0, %r1 store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, -; SI-LABEL: fast_frem_f32: +define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -1211,14 +3170,25 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v2, v1 -; 
SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; SI-NEXT: v_fma_f32 v4, v5, v4, v4 +; SI-NEXT: v_mul_f32_e32 v5, v2, v4 +; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, v6, v4, v5 +; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; -; CI-LABEL: fast_frem_f32: +; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -1236,14 +3206,25 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 +; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 +; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v5, v4, v4 +; CI-NEXT: v_mul_f32_e32 v5, v2, v4 +; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, v6, v4, v5 +; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: fast_frem_f32: +; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: ; 
VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -1259,14 +3240,25 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v3, v2 -; VI-NEXT: v_mul_f32_e32 v3, v4, v3 +; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 +; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 +; VI-NEXT: v_rcp_f32_e32 v6, v5 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; VI-NEXT: v_fma_f32 v6, v7, v6, v6 +; VI-NEXT: v_mul_f32_e32 v7, v3, v6 +; VI-NEXT: v_fma_f32 v8, -v5, v7, v3 +; VI-NEXT: v_fma_f32 v7, v8, v6, v7 +; VI-NEXT: v_fma_f32 v3, -v5, v7, v3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f32_e32 v3, v3 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: fast_frem_f32: +; GFX9-LABEL: unsafe_frem_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 @@ -1275,14 +3267,25 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rcp_f32_e32 v3, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, v1 +; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v4 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3 +; 
GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: fast_frem_f32: +; GFX10-LABEL: unsafe_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1293,14 +3296,25 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX10-NEXT: v_div_scale_f32 v4, s2, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: fast_frem_f32: +; GFX11-LABEL: unsafe_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1311,16 +3325,31 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 +; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v5, v4 +; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX11-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX11-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX1150-LABEL: fast_frem_f32: +; GFX1150-LABEL: unsafe_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1331,9 +3360,24 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 +; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v5, v4 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; 
GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX1150-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 @@ -1341,7 +3385,7 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ; -; GFX1200-LABEL: fast_frem_f32: +; GFX1200-LABEL: unsafe_frem_f32: ; GFX1200: ; %bb.0: ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1352,51 +3396,180 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_rcp_f32_e32 v3, v2 -; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX1200-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 +; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f32_e32 v5, v4 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX1200-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX1200-NEXT: v_trunc_f32_e32 v3, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { + ptr addrspace(1) %in2) #1 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem fast float %r0, %r1 + %r2 = frem afn float %r0, %r1 store float %r2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, -; SI-LABEL: unsafe_frem_f32: +define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; SI-LABEL: frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: 
s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v2, v1 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]| +; SI-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB6_3 +; SI-NEXT: s_branch .LBB6_8 +; SI-NEXT: .LBB6_2: +; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB6_3: ; %frem.compute +; SI-NEXT: s_brev_b32 s5, -2 +; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 0x7ff00000 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[0:1] +; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s2, v6 +; SI-NEXT: s_cselect_b32 s3, s2, 0 +; SI-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[0:1] +; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; SI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; SI-NEXT: 
v_cndmask_b32_e32 v4, v2, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v8 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_add_i32 s4, s7, -1 +; SI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; SI-NEXT: s_not_b32 s0, s4 +; SI-NEXT: s_add_i32 s6, s0, s3 +; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[4:5], v[4:5], 1.0 +; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], 1.0, v[4:5], 1.0 +; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v9 +; SI-NEXT: s_mov_b32 s0, 0x3ff00000 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v13 +; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-NEXT: s_nop 0 +; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] +; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; SI-NEXT: s_cmp_lt_i32 s6, 27 +; SI-NEXT: s_cbranch_scc1 .LBB6_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s0, s3, s7 +; SI-NEXT: s_add_i32 s6, s0, 26 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: v_mov_b32_e32 v14, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v10, 0 +; SI-NEXT: .LBB6_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v13, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mul_f64 v[6:7], v[12:13], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3] +; SI-NEXT: v_bfi_b32 v11, s5, v14, v7 +; SI-NEXT: v_add_f64 v[15:16], v[6:7], v[10:11] +; SI-NEXT: v_add_f64 v[15:16], v[15:16], -v[10:11] +; SI-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc +; SI-NEXT: v_fma_f64 v[6:7], -v[6:7], 
v[4:5], v[12:13] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_add_f64 v[15:16], v[6:7], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v16, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v15, vcc +; SI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; SI-NEXT: s_sub_i32 s6, s6, 26 +; SI-NEXT: s_cmp_gt_i32 s6, 26 +; SI-NEXT: s_cbranch_scc1 .LBB6_5 +; SI-NEXT: ; %bb.6: ; %Flow +; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: v_mov_b32_e32 v7, v13 +; SI-NEXT: .LBB6_7: ; %frem.loop_exit +; SI-NEXT: s_sub_i32 s0, s6, 25 +; SI-NEXT: v_ldexp_f64 v[6:7], v[6:7], s0 +; SI-NEXT: v_mul_f64 v[8:9], v[6:7], v[8:9] +; SI-NEXT: s_mov_b32 s0, -1 +; SI-NEXT: s_mov_b32 s1, 0x432fffff +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[8:9]|, s[0:1] +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_mov_b32_e32 v10, 0x43300000 +; SI-NEXT: v_bfi_b32 v11, s0, v10, v9 +; SI-NEXT: v_mov_b32_e32 v10, 0 +; SI-NEXT: v_add_f64 v[12:13], v[8:9], v[10:11] +; SI-NEXT: v_add_f64 v[10:11], v[12:13], -v[10:11] +; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc +; SI-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_add_f64 v[4:5], v[6:7], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NEXT: v_ldexp_f64 v[4:5], v[4:5], s4 +; SI-NEXT: v_bfi_b32 v5, s0, v5, v1 +; SI-NEXT: .LBB6_8: ; %Flow17 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[2:3] +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 0x7ff00000 +; SI-NEXT: v_cmp_nge_f64_e64 s[0:1], |v[0:1]|, s[0:1] +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc +; SI-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; -; CI-LABEL: unsafe_frem_f32: +; CI-LABEL: frem_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; 
CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -1404,152 +3577,707 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s6, s10 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s0 -; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: s_mov_b32 s0, s2 -; CI-NEXT: s_mov_b32 s1, s3 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: v_mul_f32_e32 v2, v0, v2 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB6_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; CI-NEXT: s_cbranch_execz .LBB6_3 +; CI-NEXT: s_branch .LBB6_8 +; CI-NEXT: .LBB6_2: +; CI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CI-NEXT: .LBB6_3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; CI-NEXT: v_frexp_exp_i32_f64_e32 v11, v[2:3] +; CI-NEXT: v_frexp_exp_i32_f64_e32 v10, v[0:1] +; CI-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; CI-NEXT: v_add_i32_e32 v12, vcc, -1, v11 +; CI-NEXT: v_not_b32_e32 v8, v12 +; CI-NEXT: v_add_i32_e32 v13, vcc, v8, v10 +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; CI-NEXT: v_div_scale_f64 v[8:9], s[2:3], v[4:5], v[4:5], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[14:15], v[8:9] +; CI-NEXT: v_fma_f64 
v[16:17], -v[8:9], v[14:15], 1.0 +; CI-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; CI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0 +; CI-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; CI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[4:5], 1.0 +; CI-NEXT: v_mul_f64 v[18:19], v[16:17], v[14:15] +; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; CI-NEXT: s_nop 1 +; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[14:15], v[18:19] +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v13 +; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB6_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v10, vcc, v10, v11 +; CI-NEXT: v_add_i32_e32 v13, vcc, 26, v10 +; CI-NEXT: .LBB6_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v11, v7 +; CI-NEXT: v_mov_b32_e32 v10, v6 +; CI-NEXT: v_mul_f64 v[6:7], v[10:11], v[8:9] +; CI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[14:15], v[6:7], v[4:5] +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; CI-NEXT: v_subrev_i32_e32 v13, vcc, 26, v13 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v13 +; CI-NEXT: s_cbranch_vccnz .LBB6_5 +; CI-NEXT: ; %bb.6: ; %Flow +; CI-NEXT: v_mov_b32_e32 v6, v10 +; CI-NEXT: v_mov_b32_e32 v7, v11 +; CI-NEXT: .LBB6_7: ; %frem.loop_exit +; CI-NEXT: v_subrev_i32_e32 v10, vcc, 25, v13 +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mul_f64 v[8:9], v[6:7], v[8:9] +; CI-NEXT: v_rndne_f64_e32 v[8:9], v[8:9] +; CI-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[4:5], v[6:7], v[4:5] +; CI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; CI-NEXT: v_ldexp_f64 v[4:5], 
v[4:5], v12 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; CI-NEXT: .LBB6_8: ; %Flow17 +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: s_mov_b32 s5, 0x7ff00000 +; CI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_cmp_nge_f64_e64 s[4:5], |v[0:1]|, s[4:5] +; CI-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_and_b64 vcc, s[4:5], vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v0, v5, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: unsafe_frem_f32: +; VI-LABEL: frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, 16 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_load_dword v4, v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v3, v2 -; VI-NEXT: v_mul_f32_e32 v3, v4, v3 -; VI-NEXT: v_trunc_f32_e32 v3, v3 -; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; VI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; VI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; VI-NEXT: s_cbranch_execz .LBB6_3 +; VI-NEXT: s_branch .LBB6_8 +; VI-NEXT: .LBB6_2: +; VI-NEXT: ; implicit-def: 
$vgpr4_vgpr5 +; VI-NEXT: .LBB6_3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; VI-NEXT: v_frexp_exp_i32_f64_e32 v11, v[2:3] +; VI-NEXT: v_frexp_exp_i32_f64_e32 v10, v[0:1] +; VI-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; VI-NEXT: v_add_u32_e32 v12, vcc, -1, v11 +; VI-NEXT: v_not_b32_e32 v8, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, v8, v10 +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; VI-NEXT: v_div_scale_f64 v[8:9], s[2:3], v[4:5], v[4:5], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[14:15], v[8:9] +; VI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0 +; VI-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; VI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0 +; VI-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; VI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[4:5], 1.0 +; VI-NEXT: v_mul_f64 v[18:19], v[16:17], v[14:15] +; VI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; VI-NEXT: s_nop 1 +; VI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[14:15], v[18:19] +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v13 +; VI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB6_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v10, vcc, v10, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 26, v10 +; VI-NEXT: .LBB6_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v11, v7 +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mul_f64 v[6:7], v[10:11], v[8:9] +; VI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[14:15], v[6:7], v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; VI-NEXT: v_subrev_u32_e32 v13, vcc, 26, v13 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v13 +; VI-NEXT: s_cbranch_vccnz .LBB6_5 +; VI-NEXT: ; %bb.6: ; %Flow 
+; VI-NEXT: v_mov_b32_e32 v6, v10 +; VI-NEXT: v_mov_b32_e32 v7, v11 +; VI-NEXT: .LBB6_7: ; %frem.loop_exit +; VI-NEXT: v_subrev_u32_e32 v10, vcc, 25, v13 +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_mul_f64 v[8:9], v[6:7], v[8:9] +; VI-NEXT: v_rndne_f64_e32 v[8:9], v[8:9] +; VI-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[4:5], v[6:7], v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 +; VI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; VI-NEXT: .LBB6_8: ; %Flow17 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, 0x7ff00000 +; VI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_nge_f64_e64 s[0:1], |v[0:1]|, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: unsafe_frem_f32: +; GFX9-LABEL: frem_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rcp_f32_e32 v3, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB6_2 +; GFX9-NEXT: 
; %bb.1: ; %frem.else +; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_branch .LBB6_8 +; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: .LBB6_3: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v11, v[2:3] +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v10, v[0:1] +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; GFX9-NEXT: v_add_u32_e32 v12, -1, v11 +; GFX9-NEXT: v_not_b32_e32 v8, v12 +; GFX9-NEXT: v_add_u32_e32 v13, v8, v10 +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; GFX9-NEXT: v_div_scale_f64 v[8:9], s[2:3], v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[8:9] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[4:5], 1.0 +; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[14:15] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[14:15], v[18:19] +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v13 +; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB6_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u32_e32 v13, 26, v10 +; GFX9-NEXT: .LBB6_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mul_f64 v[6:7], v[10:11], v[8:9] +; GFX9-NEXT: v_subrev_u32_e32 v13, 26, v13 +; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; 
GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_add_f64 v[14:15], v[6:7], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v13 +; GFX9-NEXT: s_cbranch_vccnz .LBB6_5 +; GFX9-NEXT: ; %bb.6: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-NEXT: .LBB6_7: ; %frem.loop_exit +; GFX9-NEXT: v_subrev_u32_e32 v10, 25, v13 +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_mul_f64 v[8:9], v[6:7], v[8:9] +; GFX9-NEXT: v_rndne_f64_e32 v[8:9], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_add_f64 v[4:5], v[6:7], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 +; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1 +; GFX9-NEXT: .LBB6_8: ; %Flow17 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000 +; GFX9-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], |v[0:1]|, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: unsafe_frem_f32: +; GFX10-LABEL: frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] 
offset:16 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB6_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]| +; GFX10-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-NEXT: s_branch .LBB6_8 +; GFX10-NEXT: .LBB6_2: +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: .LBB6_3: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v9, v[2:3] +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v8, v[0:1] +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v9 +; GFX10-NEXT: v_readfirstlane_b32 s3, v9 +; GFX10-NEXT: v_readfirstlane_b32 s2, v8 +; GFX10-NEXT: v_not_b32_e32 v9, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v13, v9, v8 +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc_lo, 1.0, v[4:5], 1.0 +; GFX10-NEXT: v_mul_f64 v[16:17], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], 
v[8:9], v[10:11], v[16:17] +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v13 +; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB6_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 26 +; GFX10-NEXT: .LBB6_5: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v6 +; GFX10-NEXT: s_sub_i32 s2, s2, 26 +; GFX10-NEXT: s_cmp_gt_i32 s2, 26 +; GFX10-NEXT: v_mul_f64 v[6:7], v[10:11], v[8:9] +; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX10-NEXT: v_add_f64 v[13:14], v[6:7], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX10-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX10-NEXT: ; %bb.6: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v6, v10 +; GFX10-NEXT: v_mov_b32_e32 v13, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, v11 +; GFX10-NEXT: .LBB6_7: ; %frem.loop_exit +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 25, v13 +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; GFX10-NEXT: v_mul_f64 v[8:9], v[6:7], v[8:9] +; GFX10-NEXT: v_rndne_f64_e32 v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 +; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX10-NEXT: .LBB6_8: ; %Flow17 +; GFX10-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; 
GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: unsafe_frem_f32: +; GFX11-LABEL: frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB6_2 +; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]| +; GFX11-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB6_3 +; GFX11-NEXT: s_branch .LBB6_8 +; GFX11-NEXT: .LBB6_2: +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB6_3: ; %frem.compute +; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v9, v[2:3] +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v8, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v9 +; GFX11-NEXT: v_readfirstlane_b32 s2, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v9, v12 +; GFX11-NEXT: 
v_add_nc_u32_e32 v13, v9, v8 +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v3, v3 -; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[14:15], vcc_lo, 1.0, v[4:5], 1.0 +; GFX11-NEXT: v_mul_f64 v[16:17], v[14:15], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[14:15] +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[16:17] +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB6_7 +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 26 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB6_5: ; %frem.loop_body +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v11, v7 :: v_dual_mov_b32 v10, v6 +; GFX11-NEXT: s_sub_i32 s2, s2, 26 +; GFX11-NEXT: s_cmp_gt_i32 s2, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[6:7], v[10:11], v[8:9] +; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11] +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_add_f64 v[13:14], v[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v14 :: v_dual_cndmask_b32 v6, v6, v13 +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX11-NEXT: ; %bb.6: ; %Flow +; GFX11-NEXT: v_dual_mov_b32 v13, s2 :: v_dual_mov_b32 v6, v10 +; GFX11-NEXT: v_mov_b32_e32 v7, v11 +; GFX11-NEXT: .LBB6_7: ; %frem.loop_exit +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 25, v13 +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[6:7], v[8:9] +; GFX11-NEXT: v_rndne_f64_e32 v[8:9], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v5 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX11-NEXT: .LBB6_8: ; %Flow17 +; GFX11-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_cndmask_b32 v1, 0x7ff80000, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, 
v4, vcc_lo +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX1150-LABEL: unsafe_frem_f32: +; GFX1150-LABEL: frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: v_mov_b32_e32 v2, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1150-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v2, s[4:5] ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]| +; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1150-NEXT: s_cbranch_vccz .LBB6_2 +; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]| +; GFX1150-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB6_3 +; GFX1150-NEXT: s_branch .LBB6_8 +; GFX1150-NEXT: .LBB6_2: +; GFX1150-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1150-NEXT: .LBB6_3: ; %frem.compute +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v9, v[2:3] +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v8, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; GFX1150-NEXT: v_add_nc_u32_e32 v12, -1, v9 +; GFX1150-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: 
v_not_b32_e32 v9, v12 +; GFX1150-NEXT: v_add_nc_u32_e32 v13, v9, v8 +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], 1.0 +; GFX1150-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1150-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1150-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX1150-NEXT: v_div_scale_f64 v[14:15], vcc_lo, 1.0, v[4:5], 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[16:17], v[14:15], v[10:11] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[14:15] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[16:17] +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v13 +; GFX1150-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB6_7 +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: s_sub_i32 s2, s2, s3 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s2, s2, 26 +; GFX1150-NEXT: .p2align 6 +; GFX1150-NEXT: .LBB6_5: ; %frem.loop_body +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v11, v7 :: 
v_dual_mov_b32 v10, v6 +; GFX1150-NEXT: s_sub_i32 s2, s2, 26 +; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[6:7], v[10:11], v[8:9] +; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11] +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1150-NEXT: v_add_f64 v[13:14], v[6:7], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_dual_cndmask_b32 v7, v7, v14 :: v_dual_cndmask_b32 v6, v6, v13 +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1150-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX1150-NEXT: ; %bb.6: ; %Flow +; GFX1150-NEXT: v_dual_mov_b32 v13, s2 :: v_dual_mov_b32 v6, v10 +; GFX1150-NEXT: v_mov_b32_e32 v7, v11 +; GFX1150-NEXT: .LBB6_7: ; %frem.loop_exit +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_subrev_nc_u32_e32 v10, 25, v13 +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[8:9], v[6:7], v[8:9] +; GFX1150-NEXT: v_rndne_f64_e32 v[8:9], v[8:9] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1150-NEXT: v_add_f64 v[4:5], v[6:7], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_dual_cndmask_b32 v5, v7, v5 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX1150-NEXT: .LBB6_8: ; %Flow17 +; GFX1150-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, 
v[2:3] +; GFX1150-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX1150-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX1150-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_cndmask_b32 v1, 0x7ff80000, v5 +; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX1150-NEXT: global_store_b64 v6, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ; -; GFX1200-LABEL: unsafe_frem_f32: +; GFX1200-LABEL: frem_f64: ; GFX1200: ; %bb.0: ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: v_mov_b32_e32 v2, 0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1200-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX1200-NEXT: global_load_b64 v[2:3], v2, s[4:5] ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]| +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1200-NEXT: s_cbranch_vccz .LBB6_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]| +; GFX1200-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB6_3 +; GFX1200-NEXT: s_branch .LBB6_8 +; GFX1200-NEXT: .LBB6_2: +; GFX1200-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1200-NEXT: .LBB6_3: ; %frem.compute +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v9, v[2:3] +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v8, v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[4:5], 26 +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[2:3]| +; 
GFX1200-NEXT: v_add_nc_u32_e32 v12, -1, v9 +; GFX1200-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v9, v12 +; GFX1200-NEXT: v_add_nc_u32_e32 v13, v9, v8 +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], 1.0 +; GFX1200-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX1200-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1200-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 -; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1200-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX1200-NEXT: v_div_scale_f64 v[14:15], vcc_lo, 1.0, v[4:5], 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[16:17], v[14:15], v[10:11] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[14:15] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[16:17] +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v13 +; GFX1200-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB6_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: 
s_add_co_i32 s2, s2, 26 +; GFX1200-NEXT: .LBB6_5: ; %frem.loop_body +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: v_dual_mov_b32 v11, v7 :: v_dual_mov_b32 v10, v6 +; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 +; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[10:11], v[8:9] +; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[6:7] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11] +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1200-NEXT: v_add_f64_e32 v[13:14], v[6:7], v[4:5] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_dual_cndmask_b32 v7, v7, v14 :: v_dual_cndmask_b32 v6, v6, v13 +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1200-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow +; GFX1200-NEXT: v_dual_mov_b32 v13, s2 :: v_dual_mov_b32 v6, v10 +; GFX1200-NEXT: v_mov_b32_e32 v7, v11 +; GFX1200-NEXT: .LBB6_7: ; %frem.loop_exit +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_subrev_nc_u32_e32 v10, 25, v13 +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[8:9], v[6:7], v[8:9] +; GFX1200-NEXT: v_rndne_f64_e32 v[8:9], v[8:9] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7] +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1200-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[4:5] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_dual_cndmask_b32 v5, v7, v5 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX1200-NEXT: .LBB6_8: ; %Flow17 +; GFX1200-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[2:3] +; GFX1200-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX1200-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_cndmask_b32 v1, 0x7ff80000, v5 +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX1200-NEXT: global_store_b64 v6, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #1 { - %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 - %r0 = load float, ptr addrspace(1) %in1, align 4 - %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem afn float %r0, %r1 - store float %r2, ptr addrspace(1) %out, align 4 + ptr addrspace(1) %in2) #0 { + %r0 = load double, ptr addrspace(1) %in1, align 8 + %r1 = load double, ptr addrspace(1) %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, -; SI-LABEL: frem_f64: +define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1601,7 +4329,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; CI-LABEL: frem_f64: +; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd @@ -1636,7 +4364,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; -; VI-LABEL: frem_f64: +; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 @@ -1667,7 +4395,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: frem_f64: +; GFX9-LABEL: fast_frem_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 @@ -1693,7 +4421,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: frem_f64: +; GFX10-LABEL: fast_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1710,270 +4438,14 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] -; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] -; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: frem_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v12, s[2:3] -; GFX11-NEXT: 
global_load_b64 v[2:3], v12, s[4:5] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] -; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX11-NEXT: global_store_b64 v12, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX1150-LABEL: frem_f64: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v12, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[2:3] -; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[4:5] -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; 
GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX1150-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] -; GFX1150-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; GFX1150-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1] -; GFX1150-NEXT: s_endpgm -; -; GFX1200-LABEL: frem_f64: -; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v12, 0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: global_load_b64 v[0:1], v12, s[2:3] -; GFX1200-NEXT: global_load_b64 v[2:3], v12, s[4:5] -; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX1200-NEXT: v_fma_f64 v[8:9], 
-v[4:5], v[6:7], 1.0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX1200-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] -; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[8:9], v[6:7] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; GFX1200-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] -; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { - %r0 = load double, ptr addrspace(1) %in1, align 8 - %r1 = load double, ptr addrspace(1) %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, ptr addrspace(1) %out, align 8 - ret void -} - -define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, -; SI-LABEL: fast_frem_f64: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; SI-NEXT: v_fma_f64 
v[6:7], -v[2:3], v[4:5], 1.0 -; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_readfirstlane_b32 s5, v5 -; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 -; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 -; SI-NEXT: s_mov_b32 s7, 0xfffff -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s9, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s5, s5, s7 -; SI-NEXT: s_cselect_b32 s4, s4, s6 -; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; CI-LABEL: fast_frem_f64: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s0 -; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: s_mov_b32 s0, s2 -; CI-NEXT: s_mov_b32 s1, s3 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; CI-NEXT: 
v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: fast_frem_f64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] -; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] -; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] -; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fast_frem_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; GFX9-NEXT: v_trunc_f64_e32 
v[4:5], v[4:5] -; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: fast_frem_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fast_frem_f64: @@ -1981,28 +4453,32 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v10, 0 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] -; GFX11-NEXT: 
global_load_b64 v[2:3], v10, s[4:5] +; GFX11-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: fast_frem_f64: @@ -2010,28 +4486,32 @@ define 
amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v10, 0 +; GFX1150-NEXT: v_mov_b32_e32 v12, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[2:3] -; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[4:5] +; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1150-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX1150-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1150-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX1150-NEXT: v_div_fmas_f64 v[4:5], v[4:5], 
v[6:7], v[10:11] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1150-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ; ; GFX1200-LABEL: fast_frem_f64: @@ -2039,28 +4519,32 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v10, 0 +; GFX1200-NEXT: v_mov_b32_e32 v12, 0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3] -; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5] +; GFX1200-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX1200-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; 
GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1200-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[8:9], v[6:7] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5] -; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1200-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX1200-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1200-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 @@ -2073,47 +4557,54 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; 
SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_readfirstlane_b32 s5, v5 -; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 -; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 -; SI-NEXT: s_mov_b32 s7, 0xfffff -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s9, s5, 0x80000000 +; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] +; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9 +; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-NEXT: s_nop 1 +; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] +; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] +; SI-NEXT: 
v_readfirstlane_b32 s0, v4 +; SI-NEXT: v_readfirstlane_b32 s1, v5 +; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014 +; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01 +; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 +; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: s_and_b32 s9, s1, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s9, s3 ; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s5, s5, s7 -; SI-NEXT: s_cselect_b32 s4, s4, s6 -; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_cselect_b32 s1, s1, s3 +; SI-NEXT: s_cselect_b32 s0, s0, s2 +; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f64: @@ -2134,14 +4625,18 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; CI-NEXT: 
v_mul_f64 v[10:11], v[8:9], v[6:7] +; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; CI-NEXT: s_nop 1 +; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 @@ -2161,14 +4656,18 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] -; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] -; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] -; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] +; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] +; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] +; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; VI-NEXT: s_nop 1 +; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] +; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -2178,22 +4677,26 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: 
v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: unsafe_frem_f64: @@ -2201,23 +4704,26 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[4:5], s2, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: unsafe_frem_f64: @@ -2225,28 +4731,32 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v10, 0 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
global_load_b64 v[0:1], v10, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v10, s[4:5] +; GFX11-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] -; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: 
unsafe_frem_f64: @@ -2254,28 +4764,32 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v10, 0 +; GFX1150-NEXT: v_mov_b32_e32 v12, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[2:3] -; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[4:5] +; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1150-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX1150-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] -; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1150-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; 
GFX1150-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1150-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ; ; GFX1200-LABEL: unsafe_frem_f64: @@ -2283,28 +4797,32 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v10, 0 +; GFX1200-NEXT: v_mov_b32_e32 v12, 0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3] -; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5] +; GFX1200-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX1200-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1200-NEXT: 
v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1200-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[8:9], v[6:7] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5] -; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1200-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX1200-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1200-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %r0 = load double, ptr addrspace(1) %in1, align 8 @@ -2318,127 +4836,427 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-LABEL: frem_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, 
s11 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 -; SI-NEXT: v_rcp_f32_e32 v6, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v2 +; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v3 +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v3| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: s_cbranch_vccz .LBB9_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: v_bfi_b32 v7, s0, 0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB9_3 +; SI-NEXT: s_branch .LBB9_8 +; SI-NEXT: .LBB9_2: +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB9_3: ; %frem.compute +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: s_cselect_b32 s2, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; SI-NEXT: v_ldexp_f32_e64 v5, v4, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v6 +; SI-NEXT: 
v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v6, v6 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v6 +; SI-NEXT: s_cselect_b32 s3, s0, 0 +; SI-NEXT: s_add_i32 s0, s3, -1 +; SI-NEXT: v_ldexp_f32_e64 v4, v4, 1 +; SI-NEXT: s_not_b32 s1, s0 +; SI-NEXT: s_add_i32 s1, s1, s2 +; SI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v4, 1.0 +; SI-NEXT: v_div_scale_f32 v7, s[4:5], v4, v4, 1.0 +; SI-NEXT: v_rcp_f32_e32 v8, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; SI-NEXT: v_fma_f32 v6, v7, v6, v6 -; SI-NEXT: v_mul_f32_e32 v7, v4, v6 -; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; SI-NEXT: v_fma_f32 v7, v8, v6, v7 -; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; SI-NEXT: v_fma_f32 v8, v9, v8, v8 +; SI-NEXT: v_mul_f32_e32 v9, v6, v8 +; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 +; SI-NEXT: v_fma_f32 v9, v10, v8, v9 +; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 -; SI-NEXT: v_trunc_f32_e32 v4, v4 -; SI-NEXT: v_fma_f32 v0, -v4, v2, v0 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 -; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 +; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; SI-NEXT: s_cmp_lt_i32 s1, 12 +; SI-NEXT: s_cbranch_scc1 .LBB9_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_add_i32 s1, s1, 11 +; SI-NEXT: .LBB9_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_mul_f32_e32 v5, v7, v6 +; SI-NEXT: v_rndne_f32_e32 v5, v5 +; SI-NEXT: v_fma_f32 v5, -v5, v4, v7 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; 
SI-NEXT: v_add_f32_e32 v8, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; SI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB9_5 +; SI-NEXT: ; %bb.6: ; %Flow55 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: .LBB9_7: ; %frem.loop_exit +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v5, v5, s1 +; SI-NEXT: v_mul_f32_e32 v6, v5, v6 +; SI-NEXT: v_rndne_f32_e32 v6, v6 +; SI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; SI-NEXT: v_add_f32_e32 v4, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; SI-NEXT: v_ldexp_f32_e64 v4, v4, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v4, s0, v4, v2 +; SI-NEXT: .LBB9_8: +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v6, |v5| +; SI-NEXT: v_cvt_f32_f16_e64 v7, |v7| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7 +; SI-NEXT: s_cbranch_vccz .LBB9_10 +; SI-NEXT: ; %bb.9: ; %frem.else20 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v8, s0, 0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v7 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB9_11 +; SI-NEXT: s_branch .LBB9_16 +; SI-NEXT: .LBB9_10: +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB9_11: ; %frem.compute19 +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v6 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v5 +; SI-NEXT: s_cselect_b32 s2, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v5, v6 +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v5, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v5, v7 +; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, 
vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v7, v7 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v7 +; SI-NEXT: s_cselect_b32 s3, s0, 0 +; SI-NEXT: s_add_i32 s0, s3, -1 +; SI-NEXT: v_ldexp_f32_e64 v5, v5, 1 +; SI-NEXT: s_not_b32 s1, s0 +; SI-NEXT: s_add_i32 s1, s1, s2 +; SI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v5, 1.0 +; SI-NEXT: v_div_scale_f32 v8, s[4:5], v5, v5, 1.0 +; SI-NEXT: v_rcp_f32_e32 v9, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v5, v6, v5, v5 -; SI-NEXT: v_mul_f32_e32 v6, v2, v5 -; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 -; SI-NEXT: v_fma_f32 v6, v7, v5, v6 -; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 +; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; SI-NEXT: v_fma_f32 v9, v10, v9, v9 +; SI-NEXT: v_mul_f32_e32 v10, v7, v9 +; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 +; SI-NEXT: v_fma_f32 v10, v11, v9, v10 +; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 -; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 +; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 +; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; SI-NEXT: s_cmp_lt_i32 s1, 12 +; SI-NEXT: s_cbranch_scc1 .LBB9_15 +; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_add_i32 s1, s1, 11 +; SI-NEXT: .LBB9_13: ; %frem.loop_body27 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_mul_f32_e32 v6, v8, v7 +; SI-NEXT: v_rndne_f32_e32 v6, v6 +; SI-NEXT: v_fma_f32 v6, -v6, v5, v8 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v9, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB9_13 +; SI-NEXT: ; %bb.14: ; %Flow +; SI-NEXT: v_mov_b32_e32 v6, v8 
+; SI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v6, v6, s1 +; SI-NEXT: v_mul_f32_e32 v7, v6, v7 +; SI-NEXT: v_rndne_f32_e32 v7, v7 +; SI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v5, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_ldexp_f32_e64 v5, v5, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v5, s0, v5, v0 +; SI-NEXT: .LBB9_16: ; %Flow54 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: s_movk_i32 s2, 0x7c00 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 +; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 +; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; 
CI-NEXT: s_mov_b32 s6, s2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s2, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s8 -; CI-NEXT: s_mov_b32 s1, s9 -; CI-NEXT: s_mov_b32 s8, s10 -; CI-NEXT: s_mov_b32 s9, s11 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; CI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; CI-NEXT: s_mov_b32 s4, s10 +; CI-NEXT: s_mov_b32 s5, s11 +; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 +; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 -; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 -; CI-NEXT: v_rcp_f32_e32 v6, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v3| +; CI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB9_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_bfi_b32 v7, s0, 0, v2 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; CI-NEXT: s_cbranch_execz .LBB9_3 +; CI-NEXT: s_branch .LBB9_8 +; CI-NEXT: .LBB9_2: +; CI-NEXT: ; implicit-def: $vgpr4 +; CI-NEXT: .LBB9_3: ; %frem.compute +; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 
+; CI-NEXT: v_frexp_mant_f32_e32 v4, v6 +; CI-NEXT: v_frexp_mant_f32_e32 v6, v5 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v5 +; CI-NEXT: v_ldexp_f32_e64 v5, v6, 1 +; CI-NEXT: v_div_scale_f32 v11, s[0:1], v5, v5, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v7, v4, 11 +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v10 +; CI-NEXT: v_not_b32_e32 v6, v4 +; CI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 +; CI-NEXT: v_rcp_f32_e32 v12, v11 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v4, v6 -; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; CI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; CI-NEXT: v_fma_f32 v12, v13, v12, v12 +; CI-NEXT: v_mul_f32_e32 v13, v8, v12 +; CI-NEXT: v_fma_f32 v14, -v11, v13, v8 +; CI-NEXT: v_fma_f32 v13, v14, v12, v13 +; CI-NEXT: v_fma_f32 v8, -v11, v13, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 -; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 -; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_rcp_f32_e32 v5, v4 +; CI-NEXT: v_div_fmas_f32 v8, v8, v12, v13 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 +; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB9_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 +; CI-NEXT: v_add_i32_e32 v6, vcc, 11, v6 +; CI-NEXT: .LBB9_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mul_f32_e32 v7, v9, v8 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v7, -v7, v5, v9 
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v10, v7, v5 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v6 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 11 +; CI-NEXT: s_cbranch_vccnz .LBB9_5 +; CI-NEXT: ; %bb.6: ; %Flow55 +; CI-NEXT: v_mov_b32_e32 v7, v9 +; CI-NEXT: .LBB9_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v6, vcc, -10, v6 +; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 +; CI-NEXT: v_mul_f32_e32 v7, v6, v8 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; CI-NEXT: v_add_f32_e32 v5, v6, v5 +; CI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_bfi_b32 v4, s0, v4, v2 +; CI-NEXT: .LBB9_8: +; CI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v5| +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; CI-NEXT: s_cbranch_vccz .LBB9_10 +; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_bfi_b32 v8, s0, 0, v0 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: s_cbranch_execz .LBB9_11 +; CI-NEXT: s_branch .LBB9_16 +; CI-NEXT: .LBB9_10: +; CI-NEXT: ; implicit-def: $vgpr5 +; CI-NEXT: .LBB9_11: ; %frem.compute19 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7 +; CI-NEXT: v_frexp_mant_f32_e32 v5, v7 +; CI-NEXT: v_frexp_mant_f32_e32 v7, v6 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v11, v6 +; CI-NEXT: v_ldexp_f32_e64 v6, v7, 1 +; CI-NEXT: v_div_scale_f32 v12, s[0:1], v6, v6, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v8, v5, 11 +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v11 +; CI-NEXT: v_not_b32_e32 v7, v5 +; CI-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; 
CI-NEXT: v_rcp_f32_e32 v13, v12 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v2, v5 -; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 +; CI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; CI-NEXT: v_fma_f32 v13, v14, v13, v13 +; CI-NEXT: v_mul_f32_e32 v14, v9, v13 +; CI-NEXT: v_fma_f32 v15, -v12, v14, v9 +; CI-NEXT: v_fma_f32 v14, v15, v13, v14 +; CI-NEXT: v_fma_f32 v9, -v12, v14, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 -; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 +; CI-NEXT: v_div_fmas_f32 v9, v9, v13, v14 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7 +; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB9_15 +; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 +; CI-NEXT: v_add_i32_e32 v7, vcc, 11, v7 +; CI-NEXT: .LBB9_13: ; %frem.loop_body27 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v10, v8 +; CI-NEXT: v_mul_f32_e32 v8, v10, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v8, -v8, v6, v10 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; CI-NEXT: v_add_f32_e32 v11, v8, v6 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; CI-NEXT: v_add_i32_e32 v7, vcc, -11, v7 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v7 +; CI-NEXT: v_ldexp_f32_e64 v8, v8, 11 +; CI-NEXT: s_cbranch_vccnz .LBB9_13 +; CI-NEXT: ; %bb.14: ; %Flow +; CI-NEXT: v_mov_b32_e32 v8, v10 +; CI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v7, vcc, -10, v7 +; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 +; CI-NEXT: v_mul_f32_e32 v8, v7, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v6, v7, v6 +; CI-NEXT: 
v_cndmask_b32_e32 v6, v7, v6, vcc +; CI-NEXT: v_ldexp_f32_e32 v5, v6, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_bfi_b32 v5, s0, v5, v0 +; CI-NEXT: .LBB9_16: ; %Flow54 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_or_b32_e32 v0, v1, v0 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: s_movk_i32 s2, 0x7c00 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 +; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v0, v3, v0 +; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f16: @@ -2446,50 +5264,171 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, 16 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_load_dword v4, v[2:3] -; VI-NEXT: 
v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s4, 16 +; VI-NEXT: s_addc_u32 s3, s5, 0 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_load_dword v1, v[1:2] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; VI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v8, v7 -; VI-NEXT: v_mul_f32_e32 v9, v5, v8 -; VI-NEXT: v_mad_f32 v10, -v7, v9, v5 -; VI-NEXT: v_mac_f32_e32 v9, v10, v8 -; VI-NEXT: v_mad_f32 v5, -v7, v9, v5 -; VI-NEXT: v_mul_f32_e32 v5, v5, v8 -; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; VI-NEXT: v_add_f32_e32 v5, v5, v9 -; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 -; VI-NEXT: v_trunc_f16_e32 v5, v5 -; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_rcp_f32_e32 v7, v6 -; VI-NEXT: v_mul_f32_e32 v8, v5, v7 -; VI-NEXT: v_mad_f32 v9, -v6, v8, v5 -; VI-NEXT: v_mac_f32_e32 v8, v9, v7 -; VI-NEXT: v_mad_f32 v5, -v6, v8, v5 -; VI-NEXT: v_mul_f32_e32 v5, v5, v7 -; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; VI-NEXT: v_add_f32_e32 v5, v5, v8 +; VI-NEXT: v_cvt_f32_f16_e64 v3, |v1| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 +; VI-NEXT: s_cbranch_vccz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v2, s2, 0, v0 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; VI-NEXT: s_cbranch_execz .LBB9_3 +; VI-NEXT: s_branch .LBB9_8 +; VI-NEXT: .LBB9_2: +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: .LBB9_3: ; %frem.compute +; VI-NEXT: v_frexp_exp_i32_f32_e32 
v7, v4 +; VI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; VI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; VI-NEXT: v_ldexp_f32 v3, v4, 1 +; VI-NEXT: v_div_scale_f32 v9, s[2:3], v3, v3, 1.0 +; VI-NEXT: v_ldexp_f32 v5, v2, 11 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8 +; VI-NEXT: v_not_b32_e32 v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v7 +; VI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; VI-NEXT: v_rcp_f32_e32 v10, v9 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v11, v10, v10 +; VI-NEXT: v_mul_f32_e32 v11, v6, v10 +; VI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; VI-NEXT: v_fma_f32 v11, v12, v10, v11 +; VI-NEXT: v_fma_f32 v6, -v9, v11, v6 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; VI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB9_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v4, vcc, v7, v8 +; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v4 +; VI-NEXT: .LBB9_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mul_f32_e32 v5, v7, v6 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; VI-NEXT: v_add_f32_e32 v8, v5, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; VI-NEXT: v_ldexp_f32 v5, v5, 11 +; VI-NEXT: s_cbranch_vccnz .LBB9_5 +; VI-NEXT: ; %bb.6: ; %Flow55 +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: .LBB9_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 +; VI-NEXT: v_ldexp_f32 v4, v5, v4 +; VI-NEXT: v_mul_f32_e32 v5, v4, v6 +; VI-NEXT: v_rndne_f32_e32 v5, v5 +; VI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; VI-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-NEXT: 
v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-NEXT: v_ldexp_f32 v2, v3, v2 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 +; VI-NEXT: .LBB9_8: +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_cvt_f32_f16_e64 v7, |v3| +; VI-NEXT: v_cvt_f32_f16_e64 v6, |v4| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; VI-NEXT: s_cbranch_vccz .LBB9_10 +; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v5, s2, 0, v3 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc +; VI-NEXT: s_cbranch_execz .LBB9_11 +; VI-NEXT: s_branch .LBB9_16 +; VI-NEXT: .LBB9_10: +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: .LBB9_11: ; %frem.compute19 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7 +; VI-NEXT: v_frexp_mant_f32_e32 v5, v7 +; VI-NEXT: v_frexp_mant_f32_e32 v7, v6 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v11, v6 +; VI-NEXT: v_ldexp_f32 v6, v7, 1 +; VI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 +; VI-NEXT: v_ldexp_f32 v8, v5, 11 +; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v11 +; VI-NEXT: v_not_b32_e32 v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v10 +; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; VI-NEXT: v_rcp_f32_e32 v13, v12 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; VI-NEXT: v_fma_f32 v13, v14, v13, v13 +; VI-NEXT: v_mul_f32_e32 v14, v9, v13 +; VI-NEXT: v_fma_f32 v15, -v12, v14, v9 +; VI-NEXT: v_fma_f32 v14, v15, v13, v14 +; VI-NEXT: v_fma_f32 v9, -v12, v14, v9 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v9, v9, v13, v14 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7 +; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB9_15 +; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11 +; VI-NEXT: v_add_u32_e32 v7, vcc, 11, v7 +; VI-NEXT: .LBB9_13: ; %frem.loop_body27 +; 
VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_mul_f32_e32 v8, v10, v9 +; VI-NEXT: v_rndne_f32_e32 v8, v8 +; VI-NEXT: v_fma_f32 v8, -v8, v6, v10 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; VI-NEXT: v_add_f32_e32 v11, v8, v6 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, -11, v7 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v7 +; VI-NEXT: v_ldexp_f32 v8, v8, 11 +; VI-NEXT: s_cbranch_vccnz .LBB9_13 +; VI-NEXT: ; %bb.14: ; %Flow +; VI-NEXT: v_mov_b32_e32 v8, v10 +; VI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; VI-NEXT: v_add_u32_e32 v7, vcc, -10, v7 +; VI-NEXT: v_ldexp_f32 v7, v8, v7 +; VI-NEXT: v_mul_f32_e32 v8, v7, v9 +; VI-NEXT: v_rndne_f32_e32 v8, v8 +; VI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v6, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; VI-NEXT: v_ldexp_f32 v5, v6, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 -; VI-NEXT: v_trunc_f16_e32 v5, v5 -; VI-NEXT: v_fma_f16 v2, -v5, v2, v4 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v5, s2, v5, v3 +; VI-NEXT: .LBB9_16: ; %Flow54 +; VI-NEXT: s_movk_i32 s4, 0x7c00 +; VI-NEXT: v_cmp_lg_f16_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_nge_f16_e64 s[2:3], |v0|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v6, 0x7e00 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_cmp_lg_f16_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |v3|, s4 +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: v_cndmask_b32_sdwa v3, v6, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2497,44 +5436,167 @@ define amdgpu_kernel void @frem_v2f16(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 +; GFX9-NEXT: global_load_dword v1, v2, s[2:3] +; GFX9-NEXT: global_load_dword v0, v2, s[6:7] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v4, |v1| ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_rcp_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-NEXT: v_cvt_f32_f16_e64 v3, |v0| +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 +; GFX9-NEXT: s_cbranch_vccz .LBB9_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v2, s2, 0, v1 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: s_branch .LBB9_8 +; GFX9-NEXT: .LBB9_2: +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: .LBB9_3: ; %frem.compute +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v4 +; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v3 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; GFX9-NEXT: v_ldexp_f32 v3, v4, 1 +; GFX9-NEXT: v_div_scale_f32 v9, s[2:3], v3, v3, 1.0 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; GFX9-NEXT: v_ldexp_f32 v5, v2, 11 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v8 +; GFX9-NEXT: v_not_b32_e32 
v4, v2 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_rcp_f32_e32 v10, v9 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 +; GFX9-NEXT: v_mul_f32_e32 v11, v6, v10 +; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v6 +; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 +; GFX9-NEXT: v_fma_f32 v6, -v9, v11, v6 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; GFX9-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v4, v7, v8 +; GFX9-NEXT: v_add_u32_e32 v4, 11, v4 +; GFX9-NEXT: .LBB9_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v7, v6 +; GFX9-NEXT: v_rndne_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v5, -v5, v3, v7 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; GFX9-NEXT: v_add_f32_e32 v8, v5, v3 +; GFX9-NEXT: v_add_u32_e32 v4, -11, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; GFX9-NEXT: v_ldexp_f32 v5, v5, 11 +; GFX9-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX9-NEXT: ; %bb.6: ; %Flow55 +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX9-NEXT: v_add_u32_e32 v4, -10, v4 +; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v6 +; GFX9-NEXT: v_rndne_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX9-NEXT: v_mad_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-NEXT: v_trunc_f16_e32 v3, v3 -; 
GFX9-NEXT: v_mac_f32_e32 v5, v8, v7 -; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 -; GFX9-NEXT: v_mad_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_div_fixup_f16 v1, v1, v6, v4 -; GFX9-NEXT: v_trunc_f16_e32 v1, v1 -; GFX9-NEXT: v_fma_f16 v1, -v1, v6, v4 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v1 +; GFX9-NEXT: .LBB9_8: +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v6, |v3| +; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 +; GFX9-NEXT: s_cbranch_vccz .LBB9_10 +; GFX9-NEXT: ; %bb.9: ; %frem.else20 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v3 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX9-NEXT: s_cbranch_execz .LBB9_11 +; GFX9-NEXT: s_branch .LBB9_16 +; GFX9-NEXT: .LBB9_10: +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: .LBB9_11: ; %frem.compute19 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 +; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6 +; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v10, v5 +; GFX9-NEXT: v_ldexp_f32 v5, v6, 1 +; GFX9-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 +; GFX9-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 +; GFX9-NEXT: v_ldexp_f32 v7, v4, 11 +; GFX9-NEXT: v_add_u32_e32 v4, -1, v10 +; GFX9-NEXT: v_not_b32_e32 v6, v4 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v9 +; GFX9-NEXT: v_rcp_f32_e32 v12, v11 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; 
GFX9-NEXT: v_fma_f32 v12, v13, v12, v12 +; GFX9-NEXT: v_mul_f32_e32 v13, v8, v12 +; GFX9-NEXT: v_fma_f32 v14, -v11, v13, v8 +; GFX9-NEXT: v_fma_f32 v13, v14, v12, v13 +; GFX9-NEXT: v_fma_f32 v8, -v11, v13, v8 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v8, v8, v12, v13 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 +; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 +; GFX9-NEXT: v_add_u32_e32 v6, 11, v6 +; GFX9-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 +; GFX9-NEXT: v_rndne_f32_e32 v7, v7 +; GFX9-NEXT: v_fma_f32 v7, -v7, v5, v9 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; GFX9-NEXT: v_add_f32_e32 v10, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v6, -11, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 11, v6 +; GFX9-NEXT: v_ldexp_f32 v7, v7, 11 +; GFX9-NEXT: s_cbranch_vccnz .LBB9_13 +; GFX9-NEXT: ; %bb.14: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX9-NEXT: v_add_u32_e32 v6, -10, v6 +; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 +; GFX9-NEXT: v_rndne_f32_e32 v7, v7 +; GFX9-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v3 +; GFX9-NEXT: .LBB9_16: ; %Flow54 +; GFX9-NEXT: s_movk_i32 s4, 0x7c00 +; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], |v1|, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX9-NEXT: 
v_cmp_lg_f16_sdwa s[2:3], v0, v5 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_nge_f16_e64 s[4:5], |v3|, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX9-NEXT: s_and_b64 vcc, s[4:5], s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: global_store_dword v5, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v2f16: @@ -2542,45 +5604,168 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 +; GFX10-NEXT: global_load_dword v1, v2, s[2:3] +; GFX10-NEXT: global_load_dword v0, v2, s[6:7] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v4, |v1| ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 -; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 -; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 -; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_rcp_f32_e32 v6, v5 -; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6 -; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v4 -; GFX10-NEXT: 
v_mac_f32_e32 v7, v8, v6 -; GFX10-NEXT: v_mad_f32 v4, -v5, v7, v4 -; GFX10-NEXT: v_mul_f32_e32 v4, v4, v6 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX10-NEXT: v_cvt_f32_f16_e64 v3, |v0| +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: s_cbranch_vccz .LBB9_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, 0, v1 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB9_3 +; GFX10-NEXT: s_branch .LBB9_8 +; GFX10-NEXT: .LBB9_2: +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: .LBB9_3: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v4 +; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v3 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 +; GFX10-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v2, v3 +; GFX10-NEXT: v_ldexp_f32 v3, v6, 1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v5 +; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: v_div_scale_f32 v7, s4, v3, v3, 1.0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX10-NEXT: v_rcp_f32_e32 v8, v7 +; GFX10-NEXT: v_not_b32_e32 v6, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX10-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX10-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 11 +; GFX10-NEXT: .LBB9_5: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: 
v_mov_b32_e32 v7, v4 +; GFX10-NEXT: s_add_i32 s2, s2, -11 +; GFX10-NEXT: s_cmp_gt_i32 s2, 11 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX10-NEXT: v_rndne_f32_e32 v4, v4 +; GFX10-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX10-NEXT: ; %bb.6: ; %Flow55 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX10-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX10-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX10-NEXT: v_rndne_f32_e32 v5, v5 +; GFX10-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v2, v1 +; GFX10-NEXT: .LBB9_8: +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v6, |v3| +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v4 +; GFX10-NEXT: s_cbranch_vccz .LBB9_10 +; GFX10-NEXT: ; %bb.9: ; %frem.else20 +; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, 0, v3 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB9_11 +; GFX10-NEXT: s_branch .LBB9_16 +; GFX10-NEXT: .LBB9_10: +; GFX10-NEXT: ; implicit-def: $vgpr5 +; GFX10-NEXT: .LBB9_11: ; %frem.compute19 +; GFX10-NEXT: v_frexp_mant_f32_e32 v5, v6 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 +; GFX10-NEXT: v_ldexp_f32 v6, v5, 11 +; GFX10-NEXT: v_frexp_mant_f32_e32 v5, v4 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s2, v7 +; GFX10-NEXT: v_ldexp_f32 v5, v5, 1 +; GFX10-NEXT: 
v_readfirstlane_b32 s3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; GFX10-NEXT: v_div_scale_f32 v9, s4, v5, v5, 1.0 +; GFX10-NEXT: v_not_b32_e32 v8, v4 +; GFX10-NEXT: v_rcp_f32_e32 v10, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 +; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 11 +; GFX10-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-NEXT: s_add_i32 s2, s2, -11 +; GFX10-NEXT: s_cmp_gt_i32 s2, 11 +; GFX10-NEXT: v_mul_f32_e32 v6, v9, v7 +; GFX10-NEXT: v_rndne_f32_e32 v6, v6 +; GFX10-NEXT: v_fma_f32 v6, -v6, v5, v9 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_add_f32_e32 v8, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v6, v6, 11 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_13 +; GFX10-NEXT: ; %bb.14: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, v9 +; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8 +; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 +; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX10-NEXT: v_rndne_f32_e32 v7, v7 +; GFX10-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v4, v5, v4 ; 
GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX10-NEXT: v_trunc_f16_e32 v4, v4 -; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, v4, v3 +; GFX10-NEXT: .LBB9_16: ; %Flow54 +; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1| +; GFX10-NEXT: v_cmp_nle_f16_e64 s3, 0x7c00, |v3| +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_cmp_lg_f16_sdwa s2, v0, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: frem_v2f16: @@ -2591,55 +5776,209 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16 +; GFX11-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[4:5] offset:16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v4, |v0.l| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v3, |v1.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_3 +; GFX11-TRUE16-NEXT: s_branch .LBB9_8 +; GFX11-TRUE16-NEXT: .LBB9_2: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v2, v3 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v3, v6, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX11-TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX11-TRUE16-NEXT: s_denorm_mode 15 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; 
GFX11-TRUE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v9, v5, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v4.l -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX11-TRUE16-NEXT: s_denorm_mode 12 +; GFX11-TRUE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX11-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow55 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX11-TRUE16-NEXT: .LBB9_8: +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v7, |v3.l| +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v5, |v4.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v7, v5 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10 +; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, 0, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v7, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v3.l, v6.l, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_11 +; GFX11-TRUE16-NEXT: s_branch .LBB9_16 +; GFX11-TRUE16-NEXT: .LBB9_10: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v6, 11 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, -1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_scale_f32 v10, null, v6, 
v6, 1.0 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v11, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0 +; GFX11-TRUE16-NEXT: s_denorm_mode 15 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v12, v8, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v13, -v10, v12, v8 +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v8, -v10, v12, v8 +; GFX11-TRUE16-NEXT: s_denorm_mode 12 +; GFX11-TRUE16-NEXT: v_div_fmas_f32 v8, v8, v11, v12 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v7, v10, v8 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v7, v7 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v7, -v7, v6, v10 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v7, 11 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB9_13 +; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v10 +; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v5, v8, v7 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, -10, v9 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v7, v9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v8, v7, v8 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v8, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: v_fma_f32 v7, -v8, v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v6.l, v4.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v5, v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v6.l, v4.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v5.l, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, v5, v3 +; GFX11-TRUE16-NEXT: .LBB9_16: ; %Flow54 +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l| +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, s2 +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v3.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v6.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -2648,123 +5987,445 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX11-FAKE16-NEXT: global_load_b32 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[4:5] offset:16 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v3, |v1| +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_3 +; GFX11-FAKE16-NEXT: s_branch .LBB9_8 +; GFX11-FAKE16-NEXT: .LBB9_2: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v3 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v2, v3 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v3, v6, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 
v6, v5 +; GFX11-FAKE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX11-FAKE16-NEXT: s_denorm_mode 15 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v9, v5, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX11-FAKE16-NEXT: s_denorm_mode 12 +; GFX11-FAKE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s2, 11 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow55 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v4, -v5, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX11-FAKE16-NEXT: .LBB9_8: +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-FAKE16-NEXT: 
v_cvt_f16_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX11-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-FAKE16-NEXT: v_fma_f16 v3, -v3, v2, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v7, |v3| +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v5, |v4| +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v7, v5 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_10 +; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-FAKE16-NEXT: v_bfi_b32 v6, 0x7fff, 0, v3 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v7, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_11 +; GFX11-FAKE16-NEXT: s_branch .LBB9_16 +; GFX11-FAKE16-NEXT: .LBB9_10: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v6, 11 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v5 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v6, v6, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, -1, v5 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_scale_f32 v10, null, v6, v6, 1.0 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v11, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-FAKE16-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0 +; GFX11-FAKE16-NEXT: s_denorm_mode 15 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v12, v8, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v13, -v10, v12, v8 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v8, -v10, v12, v8 +; GFX11-FAKE16-NEXT: s_denorm_mode 12 +; GFX11-FAKE16-NEXT: v_div_fmas_f32 v8, v8, v11, v12 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v7, v10, v8 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v7, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v7, -v7, v6, v10 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v7, 11 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB9_13 +; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v10 +; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, -10, v9 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v7, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v1, v1, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v8, v7, v8 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v8, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_fma_f32 v7, -v8, v6, v7 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_div_fixup_f16 v1, v1, v6, v4 -; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, 
vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v5, v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fma_f16 v1, -v1, v6, v4 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v3, v1 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_bfi_b32 v6, 0x7fff, v5, v3 +; GFX11-FAKE16-NEXT: .LBB9_16: ; %Flow54 +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0| +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v3| +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX1150-TRUE16-LABEL: frem_v2f16: ; GFX1150-TRUE16: ; %bb.0: ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_b32 v3, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_b32 v4, v2, s[4:5] offset:16 +; GFX1150-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_b32 v0, v0, s[6:7] offset:16 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.h +; GFX1150-TRUE16-NEXT: 
v_readfirstlane_b32 s4, v1 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.h -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1150-TRUE16-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX1150-TRUE16-NEXT: s_and_b32 s5, s3, 0x7fff +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s6, s2 +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s5, s5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 +; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 +; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s8 +; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB9_3 +; GFX1150-TRUE16-NEXT: s_branch .LBB9_8 +; GFX1150-TRUE16-NEXT: .LBB9_2: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1150-TRUE16-NEXT: s_denorm_mode 15 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v6, v7, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v1 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1150-TRUE16-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v1, v5, v1 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1150-TRUE16-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1150-TRUE16-NEXT: s_denorm_mode 12 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v1.l -; GFX1150-TRUE16-NEXT: 
v_cvt_f32_f16_e32 v1, v4.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-TRUE16-NEXT: s_sub_i32 s5, s6, s5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_add_i32 s5, s5, 11 +; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1150-TRUE16-NEXT: s_add_i32 s5, s5, -11 +; GFX1150-TRUE16-NEXT: s_cmp_gt_i32 s5, 11 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v2, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1150-TRUE16-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow55 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s5 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1150-TRUE16-NEXT: 
v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v5.l, v0.l, v6.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX1150-TRUE16-NEXT: .LBB9_8: +; GFX1150-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1150-TRUE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX1150-TRUE16-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX1150-TRUE16-NEXT: s_and_b32 s7, s5, 0x7fff +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s8, s4 +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s7, s7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 +; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 +; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 +; 
GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, s6, v1.l, s10 +; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB9_11 +; GFX1150-TRUE16-NEXT: s_branch .LBB9_16 +; GFX1150-TRUE16-NEXT: .LBB9_10: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v1, 11 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v3 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v1 +; GFX1150-TRUE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1150-TRUE16-NEXT: s_denorm_mode 15 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1150-TRUE16-NEXT: 
v_fma_f32 v9, -v6, v8, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1150-TRUE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1150-TRUE16-NEXT: s_denorm_mode 12 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-TRUE16-NEXT: s_sub_i32 s7, s8, s7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_add_i32 s7, s7, 11 +; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1150-TRUE16-NEXT: s_add_i32 s7, s7, -11 +; GFX1150-TRUE16-NEXT: s_cmp_gt_i32 s7, 11 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v3 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v1, v6, v1 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-TRUE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v5, v3, 
v2 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc1 .LBB9_13 +; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s7 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v1.l +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v4.l -; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v3.l, v5.l +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s6 +; GFX1150-TRUE16-NEXT: .LBB9_16: ; %Flow54 +; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s3, 0 +; GFX1150-TRUE16-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s2, 0x7c00 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_and_b32 s2, s2, s3 +; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s5, 0 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s4, 0x7c00 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX1150-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; @@ -2772,127 +6433,474 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16: ; %bb.0: ; GFX1150-FAKE16-NEXT: s_clause 0x1 ; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-FAKE16-NEXT: s_clause 0x1 ; GFX1150-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1150-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1150-FAKE16-NEXT: global_load_b32 v0, v0, s[6:7] offset:16 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1150-FAKE16-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX1150-FAKE16-NEXT: s_and_b32 s5, s3, 0x7fff +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s6, s2 +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s5, s5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5 +; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2 +; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s6, s5 +; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 +; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo +; GFX1150-FAKE16-NEXT: s_cbranch_execz .LBB9_3 +; GFX1150-FAKE16-NEXT: s_branch .LBB9_8 +; GFX1150-FAKE16-NEXT: .LBB9_2: +; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0 +; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1150-FAKE16-NEXT: 
v_add_nc_u32_e32 v0, -1, v0 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1150-FAKE16-NEXT: s_denorm_mode 15 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1150-FAKE16-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1150-FAKE16-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1150-FAKE16-NEXT: s_denorm_mode 12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-FAKE16-NEXT: s_sub_i32 s5, s6, s5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_add_i32 s5, s5, 11 +; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1150-FAKE16-NEXT: s_add_i32 s5, s5, -11 +; GFX1150-FAKE16-NEXT: s_cmp_gt_i32 s5, 11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 
v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v2, v2 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1150-FAKE16-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow55 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s5 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-FAKE16-NEXT: 
v_add_f32_e32 v1, v2, v1 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX1150-FAKE16-NEXT: .LBB9_8: +; GFX1150-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1150-FAKE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX1150-FAKE16-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX1150-FAKE16-NEXT: s_and_b32 s7, s5, 0x7fff +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s8, s4 +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s7, s7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7 +; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10 +; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s7 +; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 +; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo +; GFX1150-FAKE16-NEXT: s_cbranch_execz .LBB9_11 +; GFX1150-FAKE16-NEXT: s_branch .LBB9_16 +; GFX1150-FAKE16-NEXT: .LBB9_10: +; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1 +; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v1, 11 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v6 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 -; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1150-FAKE16-NEXT: s_denorm_mode 15 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5 -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v7 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5 -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1150-FAKE16-NEXT: v_fma_f32 v9, -v6, v8, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5 -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1150-FAKE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1150-FAKE16-NEXT: 
s_denorm_mode 12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-FAKE16-NEXT: s_sub_i32 s7, s8, s7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_add_i32 s7, s7, 11 +; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1150-FAKE16-NEXT: s_add_i32 s7, s7, -11 +; GFX1150-FAKE16-NEXT: s_cmp_gt_i32 s7, 11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-FAKE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc1 .LBB9_13 +; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s7 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; 
GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3 -; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s6 +; GFX1150-FAKE16-NEXT: .LBB9_16: ; %Flow54 +; GFX1150-FAKE16-NEXT: s_cmp_lg_f16 s3, 0 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-FAKE16-NEXT: s_cmp_nge_f16 s2, 0x7c00 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: s_and_b32 vcc_lo, s2, s3 +; GFX1150-FAKE16-NEXT: s_cmp_lg_f16 s5, 0 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX1150-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; 
GFX1150-FAKE16-NEXT: s_cmp_nge_f16 s4, 0x7c00 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1150-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7e00, v1 +; GFX1150-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX1150-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm ; ; GFX1200-TRUE16-LABEL: frem_v2f16: ; GFX1200-TRUE16: ; %bb.0: ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_b32 v3, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_b32 v4, v2, s[4:5] offset:16 +; GFX1200-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_b32 v0, v0, s[6:7] offset:16 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.h +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.h -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1200-TRUE16-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX1200-TRUE16-NEXT: s_and_b32 s5, s3, 0x7fff +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s6, s2 +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s5, s5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1200-TRUE16-NEXT: 
s_cmp_ngt_f32 s6, s5 +; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 +; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s8 +; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB9_3 +; GFX1200-TRUE16-NEXT: s_branch .LBB9_8 +; GFX1200-TRUE16-NEXT: .LBB9_2: +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1200-TRUE16-NEXT: s_denorm_mode 15 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1200-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1200-TRUE16-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1200-TRUE16-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1200-TRUE16-NEXT: s_denorm_mode 12 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-TRUE16-NEXT: s_sub_co_i32 s5, s6, s5 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s5, s5, 11 +; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s5, s5, -11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cmp_gt_i32 s5, 11 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v2, v2 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v1 +; GFX1200-TRUE16-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; 
GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v1, v5, v1 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow55 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s5 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX1200-TRUE16-NEXT: .LBB9_8: +; GFX1200-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1200-TRUE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX1200-TRUE16-NEXT: s_and_b32 s7, s5, 0x7fff +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s8, s4 +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s7, s7 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 +; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 +; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, s6, v1.l, s10 +; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB9_11 +; GFX1200-TRUE16-NEXT: s_branch .LBB9_16 +; GFX1200-TRUE16-NEXT: .LBB9_10: +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v1, 11 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1200-TRUE16-NEXT: 
v_add_nc_u32_e32 v1, -1, v1 +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-TRUE16-NEXT: s_denorm_mode 15 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1200-TRUE16-NEXT: v_fma_f32 v9, -v6, v8, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1200-TRUE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1200-TRUE16-NEXT: s_denorm_mode 12 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v1.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-TRUE16-NEXT: s_sub_co_i32 s7, s8, s7 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s7, s7, 11 +; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1200-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v5.l, v0.l, v6.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s7, s7, -11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cmp_gt_i32 s7, 11 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v3 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v1 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v3 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v1, v6, v1 +; GFX1200-TRUE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc1 .LBB9_13 +; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s7 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v1.l +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v4.l -; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v0, v3.l, v5.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s6 +; GFX1200-TRUE16-NEXT: .LBB9_16: ; %Flow54 +; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s3, 0 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s2, 0x7c00 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_and_b32 s2, s2, s3 +; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s5, 0 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 +; GFX1200-TRUE16-NEXT: 
s_cselect_b32 s2, -1, 0 +; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s4, 0x7c00 +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX1200-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; @@ -2900,63 +6908,246 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16: ; %bb.0: ; GFX1200-FAKE16-NEXT: s_clause 0x1 ; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-FAKE16-NEXT: s_clause 0x1 ; GFX1200-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1200-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1200-FAKE16-NEXT: global_load_b32 v0, v0, s[6:7] offset:16 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1200-FAKE16-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX1200-FAKE16-NEXT: s_and_b32 s5, s3, 0x7fff +; 
GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s6, s2 +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s5, s5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5 +; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2 +; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s6, s5 +; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 +; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo +; GFX1200-FAKE16-NEXT: s_cbranch_execz .LBB9_3 +; GFX1200-FAKE16-NEXT: s_branch .LBB9_8 +; GFX1200-FAKE16-NEXT: .LBB9_2: +; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0 +; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1200-FAKE16-NEXT: s_denorm_mode 15 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1200-FAKE16-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1200-FAKE16-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1200-FAKE16-NEXT: s_denorm_mode 12 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 +; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-FAKE16-NEXT: s_sub_co_i32 s5, s6, s5 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s5, s5, 11 +; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s5, s5, -11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cmp_gt_i32 s5, 11 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v2, v2 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1200-FAKE16-NEXT: 
v_fma_f32 v2, v2, v1, v5 +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow55 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s5 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v0, v1, v0 +; 
GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX1200-FAKE16-NEXT: .LBB9_8: +; GFX1200-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1200-FAKE16-NEXT: s_lshr_b32 s5, s3, 16 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX1200-FAKE16-NEXT: s_and_b32 s7, s5, 0x7fff +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s8, s4 +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s7, s7 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7 +; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10 +; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s7 +; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 +; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo +; GFX1200-FAKE16-NEXT: s_cbranch_execz .LBB9_11 +; GFX1200-FAKE16-NEXT: s_branch .LBB9_16 +; GFX1200-FAKE16-NEXT: .LBB9_10: +; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1 +; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v1, 11 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v6 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 -; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1200-FAKE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-FAKE16-NEXT: s_denorm_mode 15 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5 -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v7 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5 -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1200-FAKE16-NEXT: v_fma_f32 v9, -v6, v8, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5 -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1200-FAKE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1200-FAKE16-NEXT: s_denorm_mode 12 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX1200-FAKE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 +; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-FAKE16-NEXT: s_sub_co_i32 s7, s8, s7 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s7, s7, 11 +; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s7, s7, -11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cmp_gt_i32 s7, 11 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v6, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2 +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc1 .LBB9_13 +; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s7 +; 
GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s6 +; GFX1200-FAKE16-NEXT: .LBB9_16: ; %Flow54 +; GFX1200-FAKE16-NEXT: s_cmp_lg_f16 s3, 0 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-FAKE16-NEXT: s_cmp_nge_f16 s2, 0x7c00 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-FAKE16-NEXT: s_and_b32 vcc_lo, s2, s3 +; GFX1200-FAKE16-NEXT: s_cmp_lg_f16 s5, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX1200-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-FAKE16-NEXT: s_cmp_nge_f16 s4, 0x7c00 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7e00, v1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3 -; GFX1200-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1200-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX1200-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 @@ -2971,207 +7162,809 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: buffer_load_dwordx2 
v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 -; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 -; SI-NEXT: v_rcp_f32_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v7 +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v6|, |v7| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: s_cbranch_vccz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: v_bfi_b32 v11, s0, 0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB10_3 +; SI-NEXT: s_branch .LBB10_8 +; SI-NEXT: .LBB10_2: +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB10_3: ; %frem.compute +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v9 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v8 +; SI-NEXT: s_cselect_b32 s2, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v8, v9 +; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SI-NEXT: v_ldexp_f32_e64 v9, v8, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v8, v10 +; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v10 +; SI-NEXT: s_and_b64 s[0:1], vcc, 
exec +; SI-NEXT: v_readfirstlane_b32 s0, v10 +; SI-NEXT: s_cselect_b32 s3, s0, 0 +; SI-NEXT: s_add_i32 s0, s3, -1 +; SI-NEXT: v_ldexp_f32_e64 v8, v8, 1 +; SI-NEXT: s_not_b32 s1, s0 +; SI-NEXT: s_add_i32 s1, s1, s2 +; SI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 +; SI-NEXT: v_div_scale_f32 v11, s[4:5], v8, v8, 1.0 +; SI-NEXT: v_rcp_f32_e32 v12, v11 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; SI-NEXT: v_fma_f32 v10, v11, v10, v10 -; SI-NEXT: v_mul_f32_e32 v11, v8, v10 -; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 -; SI-NEXT: v_fma_f32 v11, v12, v10, v11 -; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; SI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; SI-NEXT: v_fma_f32 v12, v13, v12, v12 +; SI-NEXT: v_mul_f32_e32 v13, v10, v12 +; SI-NEXT: v_fma_f32 v14, -v11, v13, v10 +; SI-NEXT: v_fma_f32 v13, v14, v12, v13 +; SI-NEXT: v_fma_f32 v10, -v11, v13, v10 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 -; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 -; SI-NEXT: v_trunc_f32_e32 v8, v8 -; SI-NEXT: v_fma_f32 v1, -v8, v1, v5 -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 -; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 -; SI-NEXT: v_rcp_f32_e32 v9, v8 +; SI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 +; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 +; SI-NEXT: s_cmp_lt_i32 s1, 12 +; SI-NEXT: s_cbranch_scc1 .LBB10_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_add_i32 s1, s1, 11 +; SI-NEXT: .LBB10_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v11, v9 +; SI-NEXT: v_mul_f32_e32 v9, v11, v10 +; SI-NEXT: v_rndne_f32_e32 v9, v9 +; SI-NEXT: v_fma_f32 v9, -v9, v8, v11 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; SI-NEXT: v_add_f32_e32 v12, v9, v8 +; SI-NEXT: 
v_cndmask_b32_e32 v9, v9, v12, vcc +; SI-NEXT: v_ldexp_f32_e64 v9, v9, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB10_5 +; SI-NEXT: ; %bb.6: ; %Flow133 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: .LBB10_7: ; %frem.loop_exit +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v9, v9, s1 +; SI-NEXT: v_mul_f32_e32 v10, v9, v10 +; SI-NEXT: v_rndne_f32_e32 v10, v10 +; SI-NEXT: v_fma_f32 v9, -v10, v8, v9 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; SI-NEXT: v_add_f32_e32 v8, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SI-NEXT: v_ldexp_f32_e64 v8, v8, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v8, s0, v8, v6 +; SI-NEXT: .LBB10_8: +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e64 v10, |v9| +; SI-NEXT: v_cvt_f32_f16_e64 v11, |v11| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v11 +; SI-NEXT: s_cbranch_vccz .LBB10_10 +; SI-NEXT: ; %bb.9: ; %frem.else20 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v12, s0, 0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v11 +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB10_11 +; SI-NEXT: s_branch .LBB10_16 +; SI-NEXT: .LBB10_10: +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB10_11: ; %frem.compute19 +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v10 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v9 +; SI-NEXT: s_cselect_b32 s2, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v9, v10 +; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v10, v9, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v9, v11 +; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; 
SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v11 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v11 +; SI-NEXT: s_cselect_b32 s3, s0, 0 +; SI-NEXT: s_add_i32 s0, s3, -1 +; SI-NEXT: v_ldexp_f32_e64 v9, v9, 1 +; SI-NEXT: s_not_b32 s1, s0 +; SI-NEXT: s_add_i32 s1, s1, s2 +; SI-NEXT: v_div_scale_f32 v11, vcc, 1.0, v9, 1.0 +; SI-NEXT: v_div_scale_f32 v12, s[4:5], v9, v9, 1.0 +; SI-NEXT: v_rcp_f32_e32 v13, v12 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; SI-NEXT: v_fma_f32 v9, v10, v9, v9 -; SI-NEXT: v_mul_f32_e32 v10, v5, v9 -; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 -; SI-NEXT: v_fma_f32 v10, v11, v9, v10 -; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 +; SI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; SI-NEXT: v_fma_f32 v13, v14, v13, v13 +; SI-NEXT: v_mul_f32_e32 v14, v11, v13 +; SI-NEXT: v_fma_f32 v15, -v12, v14, v11 +; SI-NEXT: v_fma_f32 v14, v15, v13, v14 +; SI-NEXT: v_fma_f32 v11, -v12, v14, v11 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 -; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 -; SI-NEXT: v_trunc_f32_e32 v5, v5 -; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 -; SI-NEXT: v_rcp_f32_e32 v7, v5 +; SI-NEXT: v_div_fmas_f32 v11, v11, v13, v14 +; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 +; SI-NEXT: s_cmp_lt_i32 s1, 12 +; SI-NEXT: s_cbranch_scc1 .LBB10_15 +; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_add_i32 s1, s1, 11 +; SI-NEXT: .LBB10_13: ; %frem.loop_body27 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v12, v10 +; SI-NEXT: v_mul_f32_e32 v10, v12, v11 +; SI-NEXT: v_rndne_f32_e32 v10, v10 +; SI-NEXT: v_fma_f32 v10, -v10, v9, v12 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; SI-NEXT: v_add_f32_e32 v13, v10, v9 +; 
SI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; SI-NEXT: v_ldexp_f32_e64 v10, v10, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB10_13 +; SI-NEXT: ; %bb.14: ; %Flow129 +; SI-NEXT: v_mov_b32_e32 v10, v12 +; SI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v10, v10, s1 +; SI-NEXT: v_mul_f32_e32 v11, v10, v11 +; SI-NEXT: v_rndne_f32_e32 v11, v11 +; SI-NEXT: v_fma_f32 v10, -v11, v9, v10 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; SI-NEXT: v_add_f32_e32 v9, v10, v9 +; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v9, v9, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v9, s0, v9, v4 +; SI-NEXT: .LBB10_16: +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v11, |v10| +; SI-NEXT: v_cvt_f32_f16_e64 v12, |v12| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v12 +; SI-NEXT: s_cbranch_vccz .LBB10_18 +; SI-NEXT: ; %bb.17: ; %frem.else53 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v13, s0, 0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v11, v12 +; SI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB10_19 +; SI-NEXT: s_branch .LBB10_24 +; SI-NEXT: .LBB10_18: +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB10_19: ; %frem.compute52 +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v11 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v10 +; SI-NEXT: s_cselect_b32 s2, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v10, v11 +; SI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; SI-NEXT: v_ldexp_f32_e64 v11, v10, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v10, v12 +; SI-NEXT: 
v_cndmask_b32_e32 v10, v12, v10, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v12, v12 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v12 +; SI-NEXT: s_cselect_b32 s3, s0, 0 +; SI-NEXT: s_add_i32 s0, s3, -1 +; SI-NEXT: v_ldexp_f32_e64 v10, v10, 1 +; SI-NEXT: s_not_b32 s1, s0 +; SI-NEXT: s_add_i32 s1, s1, s2 +; SI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v10, 1.0 +; SI-NEXT: v_div_scale_f32 v13, s[4:5], v10, v10, 1.0 +; SI-NEXT: v_rcp_f32_e32 v14, v13 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 -; SI-NEXT: v_fma_f32 v7, v8, v7, v7 -; SI-NEXT: v_mul_f32_e32 v8, v4, v7 -; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 -; SI-NEXT: v_fma_f32 v8, v9, v7, v8 -; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 +; SI-NEXT: v_fma_f32 v15, -v13, v14, 1.0 +; SI-NEXT: v_fma_f32 v14, v15, v14, v14 +; SI-NEXT: v_mul_f32_e32 v15, v12, v14 +; SI-NEXT: v_fma_f32 v16, -v13, v15, v12 +; SI-NEXT: v_fma_f32 v15, v16, v14, v15 +; SI-NEXT: v_fma_f32 v12, -v13, v15, v12 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 -; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 -; SI-NEXT: v_trunc_f32_e32 v4, v4 -; SI-NEXT: v_fma_f32 v0, -v4, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 -; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_fmas_f32 v12, v12, v14, v15 +; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 +; SI-NEXT: s_cmp_lt_i32 s1, 12 +; SI-NEXT: s_cbranch_scc1 .LBB10_23 +; SI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_add_i32 s1, s1, 11 +; SI-NEXT: .LBB10_21: ; %frem.loop_body60 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v13, v11 +; SI-NEXT: v_mul_f32_e32 v11, v13, v12 +; SI-NEXT: v_rndne_f32_e32 v11, v11 +; SI-NEXT: v_fma_f32 v11, -v11, v10, v13 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 
+; SI-NEXT: v_add_f32_e32 v14, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; SI-NEXT: v_ldexp_f32_e64 v11, v11, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB10_21 +; SI-NEXT: ; %bb.22: ; %Flow125 +; SI-NEXT: v_mov_b32_e32 v11, v13 +; SI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v11, v11, s1 +; SI-NEXT: v_mul_f32_e32 v12, v11, v12 +; SI-NEXT: v_rndne_f32_e32 v12, v12 +; SI-NEXT: v_fma_f32 v11, -v12, v10, v11 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; SI-NEXT: v_add_f32_e32 v10, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; SI-NEXT: v_ldexp_f32_e64 v10, v10, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v10, s0, v10, v2 +; SI-NEXT: .LBB10_24: +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v12, |v11| +; SI-NEXT: v_cvt_f32_f16_e64 v13, |v13| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v13 +; SI-NEXT: s_cbranch_vccz .LBB10_26 +; SI-NEXT: ; %bb.25: ; %frem.else86 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v14, s0, 0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v12, v13 +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB10_27 +; SI-NEXT: s_branch .LBB10_32 +; SI-NEXT: .LBB10_26: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB10_27: ; %frem.compute85 +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v12 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v11 +; SI-NEXT: s_cselect_b32 s2, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v11, v12 +; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; SI-NEXT: v_ldexp_f32_e64 v12, v11, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v13|, 
s3 +; SI-NEXT: v_frexp_mant_f32_e32 v11, v13 +; SI-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v13, v13 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v13 +; SI-NEXT: s_cselect_b32 s3, s0, 0 +; SI-NEXT: s_add_i32 s0, s3, -1 +; SI-NEXT: v_ldexp_f32_e64 v11, v11, 1 +; SI-NEXT: s_not_b32 s1, s0 +; SI-NEXT: s_add_i32 s1, s1, s2 +; SI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v11, 1.0 +; SI-NEXT: v_div_scale_f32 v14, s[4:5], v11, v11, 1.0 +; SI-NEXT: v_rcp_f32_e32 v15, v14 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v5, v7, v5, v5 -; SI-NEXT: v_mul_f32_e32 v7, v3, v5 -; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 -; SI-NEXT: v_fma_f32 v7, v8, v5, v7 -; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 +; SI-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; SI-NEXT: v_fma_f32 v15, v16, v15, v15 +; SI-NEXT: v_mul_f32_e32 v16, v13, v15 +; SI-NEXT: v_fma_f32 v17, -v14, v16, v13 +; SI-NEXT: v_fma_f32 v16, v17, v15, v16 +; SI-NEXT: v_fma_f32 v13, -v14, v16, v13 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 -; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 -; SI-NEXT: v_trunc_f32_e32 v3, v3 -; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 +; SI-NEXT: v_div_fmas_f32 v13, v13, v15, v16 +; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 +; SI-NEXT: s_cmp_lt_i32 s1, 12 +; SI-NEXT: s_cbranch_scc1 .LBB10_31 +; SI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_add_i32 s1, s1, 11 +; SI-NEXT: .LBB10_29: ; %frem.loop_body93 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v14, v12 +; SI-NEXT: v_mul_f32_e32 v12, v14, v13 +; SI-NEXT: v_rndne_f32_e32 v12, v12 +; SI-NEXT: v_fma_f32 v12, -v12, v11, v14 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; SI-NEXT: v_add_f32_e32 v15, v12, v11 +; SI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; SI-NEXT: v_ldexp_f32_e64 v12, v12, 11 +; SI-NEXT: s_add_i32 
s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB10_29 +; SI-NEXT: ; %bb.30: ; %Flow +; SI-NEXT: v_mov_b32_e32 v12, v14 +; SI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v12, v12, s1 +; SI-NEXT: v_mul_f32_e32 v13, v12, v13 +; SI-NEXT: v_rndne_f32_e32 v13, v13 +; SI-NEXT: v_fma_f32 v12, -v13, v11, v12 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; SI-NEXT: v_add_f32_e32 v11, v12, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; SI-NEXT: v_ldexp_f32_e64 v11, v11, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v11, s0, v11, v0 +; SI-NEXT: .LBB10_32: ; %Flow124 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 +; SI-NEXT: s_movk_i32 s2, 0x7c00 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v6 +; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 +; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; 
SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 +; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 +; SI-NEXT: s_or_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s6, s2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s2, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s8 -; CI-NEXT: s_mov_b32 s1, s9 -; CI-NEXT: s_mov_b32 s8, s10 -; CI-NEXT: s_mov_b32 s9, s11 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s4, s10 +; CI-NEXT: s_mov_b32 s5, s11 +; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 
v2, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 -; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 -; CI-NEXT: v_rcp_f32_e32 v10, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v6|, |v7| +; CI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v6 +; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v7 +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB10_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_bfi_b32 v11, s0, 0, v6 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; CI-NEXT: s_cbranch_execz .LBB10_3 +; CI-NEXT: s_branch .LBB10_8 +; CI-NEXT: .LBB10_2: +; CI-NEXT: ; implicit-def: $vgpr8 +; CI-NEXT: .LBB10_3: ; %frem.compute +; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 +; CI-NEXT: v_frexp_mant_f32_e32 v8, v10 +; CI-NEXT: v_frexp_mant_f32_e32 v10, v9 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v9 +; CI-NEXT: v_ldexp_f32_e64 v9, v10, 1 +; CI-NEXT: v_div_scale_f32 v15, s[0:1], v9, v9, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v11, v8, 11 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v14 +; CI-NEXT: v_not_b32_e32 v10, v8 +; CI-NEXT: v_add_i32_e32 v10, vcc, v10, 
v13 +; CI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v9, 1.0 +; CI-NEXT: v_rcp_f32_e32 v16, v15 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; CI-NEXT: v_fma_f32 v10, v11, v10, v10 -; CI-NEXT: v_mul_f32_e32 v11, v8, v10 -; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 -; CI-NEXT: v_fma_f32 v11, v12, v10, v11 -; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; CI-NEXT: v_fma_f32 v17, -v15, v16, 1.0 +; CI-NEXT: v_fma_f32 v16, v17, v16, v16 +; CI-NEXT: v_mul_f32_e32 v17, v12, v16 +; CI-NEXT: v_fma_f32 v18, -v15, v17, v12 +; CI-NEXT: v_fma_f32 v17, v18, v16, v17 +; CI-NEXT: v_fma_f32 v12, -v15, v17, v12 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 -; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 -; CI-NEXT: v_trunc_f32_e32 v8, v8 -; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 -; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 -; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 -; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_rcp_f32_e32 v9, v8 +; CI-NEXT: v_div_fmas_f32 v12, v12, v16, v17 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 +; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14 +; CI-NEXT: v_add_i32_e32 v10, vcc, 11, v10 +; CI-NEXT: .LBB10_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v13, v11 +; CI-NEXT: v_mul_f32_e32 v11, v13, v12 +; CI-NEXT: v_rndne_f32_e32 v11, v11 +; CI-NEXT: v_fma_f32 v11, -v11, v9, v13 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; CI-NEXT: v_add_f32_e32 v14, v11, v9 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v10 +; CI-NEXT: v_ldexp_f32_e64 v11, v11, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_5 +; CI-NEXT: ; %bb.6: ; %Flow133 
+; CI-NEXT: v_mov_b32_e32 v11, v13 +; CI-NEXT: .LBB10_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v10, vcc, -10, v10 +; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 +; CI-NEXT: v_mul_f32_e32 v11, v10, v12 +; CI-NEXT: v_rndne_f32_e32 v11, v11 +; CI-NEXT: v_fma_f32 v10, -v11, v9, v10 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; CI-NEXT: v_add_f32_e32 v9, v10, v9 +; CI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; CI-NEXT: v_ldexp_f32_e32 v8, v9, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_bfi_b32 v8, s0, v8, v6 +; CI-NEXT: .LBB10_8: +; CI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v11, |v9| +; CI-NEXT: v_cvt_f32_f16_e64 v10, |v10| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; CI-NEXT: s_cbranch_vccz .LBB10_10 +; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_bfi_b32 v12, s0, 0, v4 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v11, v10 +; CI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; CI-NEXT: s_cbranch_execz .LBB10_11 +; CI-NEXT: s_branch .LBB10_16 +; CI-NEXT: .LBB10_10: +; CI-NEXT: ; implicit-def: $vgpr9 +; CI-NEXT: .LBB10_11: ; %frem.compute19 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 +; CI-NEXT: v_frexp_mant_f32_e32 v9, v11 +; CI-NEXT: v_frexp_mant_f32_e32 v11, v10 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v10 +; CI-NEXT: v_ldexp_f32_e64 v10, v11, 1 +; CI-NEXT: v_div_scale_f32 v16, s[0:1], v10, v10, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v12, v9, 11 +; CI-NEXT: v_add_i32_e32 v9, vcc, -1, v15 +; CI-NEXT: v_not_b32_e32 v11, v9 +; CI-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v10, 1.0 +; CI-NEXT: v_rcp_f32_e32 v17, v16 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; CI-NEXT: v_fma_f32 v9, v10, v9, v9 -; CI-NEXT: v_mul_f32_e32 v10, v5, v9 -; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 -; CI-NEXT: v_fma_f32 
v10, v11, v9, v10 -; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 +; CI-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; CI-NEXT: v_fma_f32 v17, v18, v17, v17 +; CI-NEXT: v_mul_f32_e32 v18, v13, v17 +; CI-NEXT: v_fma_f32 v19, -v16, v18, v13 +; CI-NEXT: v_fma_f32 v18, v19, v17, v18 +; CI-NEXT: v_fma_f32 v13, -v16, v18, v13 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 -; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 -; CI-NEXT: v_trunc_f32_e32 v5, v5 -; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_or_b32_e32 v1, v4, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 -; CI-NEXT: v_rcp_f32_e32 v7, v5 +; CI-NEXT: v_div_fmas_f32 v13, v13, v17, v18 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 +; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_15 +; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 +; CI-NEXT: v_add_i32_e32 v11, vcc, 11, v11 +; CI-NEXT: .LBB10_13: ; %frem.loop_body27 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v14, v12 +; CI-NEXT: v_mul_f32_e32 v12, v14, v13 +; CI-NEXT: v_rndne_f32_e32 v12, v12 +; CI-NEXT: v_fma_f32 v12, -v12, v10, v14 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; CI-NEXT: v_add_f32_e32 v15, v12, v10 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v11 +; CI-NEXT: v_ldexp_f32_e64 v12, v12, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_13 +; CI-NEXT: ; %bb.14: ; %Flow129 +; CI-NEXT: v_mov_b32_e32 v12, v14 +; CI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v11, vcc, -10, v11 +; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 +; CI-NEXT: v_mul_f32_e32 v12, v11, v13 +; CI-NEXT: v_rndne_f32_e32 v12, v12 +; CI-NEXT: v_fma_f32 v11, -v12, v10, v11 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; CI-NEXT: v_add_f32_e32 v10, v11, v10 +; CI-NEXT: 
v_cndmask_b32_e32 v10, v11, v10, vcc +; CI-NEXT: v_ldexp_f32_e32 v9, v10, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_bfi_b32 v9, s0, v9, v4 +; CI-NEXT: .LBB10_16: +; CI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v12, |v10| +; CI-NEXT: v_cvt_f32_f16_e64 v11, |v11| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v11 +; CI-NEXT: s_cbranch_vccz .LBB10_18 +; CI-NEXT: ; %bb.17: ; %frem.else53 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_bfi_b32 v13, s0, 0, v2 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v12, v11 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; CI-NEXT: s_cbranch_execz .LBB10_19 +; CI-NEXT: s_branch .LBB10_24 +; CI-NEXT: .LBB10_18: +; CI-NEXT: ; implicit-def: $vgpr10 +; CI-NEXT: .LBB10_19: ; %frem.compute52 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v12 +; CI-NEXT: v_frexp_mant_f32_e32 v10, v12 +; CI-NEXT: v_frexp_mant_f32_e32 v12, v11 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v11 +; CI-NEXT: v_ldexp_f32_e64 v11, v12, 1 +; CI-NEXT: v_div_scale_f32 v17, s[0:1], v11, v11, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v13, v10, 11 +; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v16 +; CI-NEXT: v_not_b32_e32 v12, v10 +; CI-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CI-NEXT: v_div_scale_f32 v14, vcc, 1.0, v11, 1.0 +; CI-NEXT: v_rcp_f32_e32 v18, v17 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 -; CI-NEXT: v_fma_f32 v7, v8, v7, v7 -; CI-NEXT: v_mul_f32_e32 v8, v4, v7 -; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 -; CI-NEXT: v_fma_f32 v8, v9, v7, v8 -; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 +; CI-NEXT: v_fma_f32 v19, -v17, v18, 1.0 +; CI-NEXT: v_fma_f32 v18, v19, v18, v18 +; CI-NEXT: v_mul_f32_e32 v19, v14, v18 +; CI-NEXT: v_fma_f32 v20, -v17, v19, v14 +; CI-NEXT: v_fma_f32 v19, v20, v18, v19 +; CI-NEXT: v_fma_f32 v14, -v17, v19, v14 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; 
CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 -; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 -; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 -; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_rcp_f32_e32 v5, v4 +; CI-NEXT: v_div_fmas_f32 v14, v14, v18, v19 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v12 +; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_23 +; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 +; CI-NEXT: v_add_i32_e32 v12, vcc, 11, v12 +; CI-NEXT: .LBB10_21: ; %frem.loop_body60 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v15, v13 +; CI-NEXT: v_mul_f32_e32 v13, v15, v14 +; CI-NEXT: v_rndne_f32_e32 v13, v13 +; CI-NEXT: v_fma_f32 v13, -v13, v11, v15 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; CI-NEXT: v_add_f32_e32 v16, v13, v11 +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc +; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v12 +; CI-NEXT: v_ldexp_f32_e64 v13, v13, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_21 +; CI-NEXT: ; %bb.22: ; %Flow125 +; CI-NEXT: v_mov_b32_e32 v13, v15 +; CI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; CI-NEXT: v_add_i32_e32 v12, vcc, -10, v12 +; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 +; CI-NEXT: v_mul_f32_e32 v13, v12, v14 +; CI-NEXT: v_rndne_f32_e32 v13, v13 +; CI-NEXT: v_fma_f32 v12, -v13, v11, v12 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; CI-NEXT: v_add_f32_e32 v11, v12, v11 +; CI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_bfi_b32 v10, s0, v10, v2 +; CI-NEXT: .LBB10_24: +; CI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v13, |v11| +; 
CI-NEXT: v_cvt_f32_f16_e64 v12, |v12| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; CI-NEXT: s_cbranch_vccz .LBB10_26 +; CI-NEXT: ; %bb.25: ; %frem.else86 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_bfi_b32 v14, s0, 0, v0 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v13, v12 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; CI-NEXT: s_cbranch_execz .LBB10_27 +; CI-NEXT: s_branch .LBB10_32 +; CI-NEXT: .LBB10_26: +; CI-NEXT: ; implicit-def: $vgpr11 +; CI-NEXT: .LBB10_27: ; %frem.compute85 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 +; CI-NEXT: v_frexp_mant_f32_e32 v11, v13 +; CI-NEXT: v_frexp_mant_f32_e32 v13, v12 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v17, v12 +; CI-NEXT: v_ldexp_f32_e64 v12, v13, 1 +; CI-NEXT: v_div_scale_f32 v18, s[0:1], v12, v12, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v14, v11, 11 +; CI-NEXT: v_add_i32_e32 v11, vcc, -1, v17 +; CI-NEXT: v_not_b32_e32 v13, v11 +; CI-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CI-NEXT: v_div_scale_f32 v15, vcc, 1.0, v12, 1.0 +; CI-NEXT: v_rcp_f32_e32 v19, v18 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v7, v5, v5 -; CI-NEXT: v_mul_f32_e32 v7, v3, v5 -; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 -; CI-NEXT: v_fma_f32 v7, v8, v5, v7 -; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 +; CI-NEXT: v_fma_f32 v20, -v18, v19, 1.0 +; CI-NEXT: v_fma_f32 v19, v20, v19, v19 +; CI-NEXT: v_mul_f32_e32 v20, v15, v19 +; CI-NEXT: v_fma_f32 v21, -v18, v20, v15 +; CI-NEXT: v_fma_f32 v20, v21, v19, v20 +; CI-NEXT: v_fma_f32 v15, -v18, v20, v15 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 -; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 +; CI-NEXT: v_div_fmas_f32 v15, v15, v19, v20 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 +; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_31 +; CI-NEXT: ; %bb.28: ; 
%frem.loop_body93.preheader +; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 +; CI-NEXT: v_add_i32_e32 v13, vcc, 11, v13 +; CI-NEXT: .LBB10_29: ; %frem.loop_body93 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v16, v14 +; CI-NEXT: v_mul_f32_e32 v14, v16, v15 +; CI-NEXT: v_rndne_f32_e32 v14, v14 +; CI-NEXT: v_fma_f32 v14, -v14, v12, v16 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v14 +; CI-NEXT: v_add_f32_e32 v17, v14, v12 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v13 +; CI-NEXT: v_ldexp_f32_e64 v14, v14, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_29 +; CI-NEXT: ; %bb.30: ; %Flow +; CI-NEXT: v_mov_b32_e32 v14, v16 +; CI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; CI-NEXT: v_add_i32_e32 v13, vcc, -10, v13 +; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 +; CI-NEXT: v_mul_f32_e32 v14, v13, v15 +; CI-NEXT: v_rndne_f32_e32 v14, v14 +; CI-NEXT: v_fma_f32 v13, -v14, v12, v13 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; CI-NEXT: v_add_f32_e32 v12, v13, v12 +; CI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: s_brev_b32 s0, -2 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_bfi_b32 v11, s0, v11, v0 +; CI-NEXT: .LBB10_32: ; %Flow124 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: s_movk_i32 s2, 0x7c00 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 +; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v6 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: 
v_cndmask_b32_e32 v7, v7, v6, vcc +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; CI-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 +; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_or_b32_e32 v0, v2, v0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v2 +; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; CI-NEXT: s_or_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v1, v2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v7, v0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f16: @@ -3179,82 +7972,322 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; 
VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_add_u32 s2, s4, 32 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; VI-NEXT: v_cvt_f32_f16_e64 v6, |v0| ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v10, v9 -; VI-NEXT: v_mul_f32_e32 v11, v7, v10 -; VI-NEXT: v_mad_f32 v12, -v9, v11, v7 -; VI-NEXT: v_mac_f32_e32 v11, v12, v10 -; VI-NEXT: v_mad_f32 v7, -v9, v11, v7 -; VI-NEXT: v_mul_f32_e32 v7, v7, v10 -; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7 -; VI-NEXT: v_add_f32_e32 v7, v7, v11 -; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 -; VI-NEXT: v_trunc_f16_e32 v7, v7 -; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_rcp_f32_e32 v9, v8 -; VI-NEXT: v_mul_f32_e32 v10, v7, v9 -; VI-NEXT: v_mad_f32 v11, -v8, v10, v7 -; VI-NEXT: v_mac_f32_e32 v10, v11, v9 -; VI-NEXT: v_mad_f32 v7, -v8, v10, v7 -; VI-NEXT: v_mul_f32_e32 v7, v7, v9 -; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7 -; VI-NEXT: v_add_f32_e32 v7, v7, v10 +; VI-NEXT: v_cvt_f32_f16_e64 v5, |v2| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 +; VI-NEXT: s_cbranch_vccz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v4, s2, 0, v0 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; VI-NEXT: s_cbranch_execz .LBB10_3 +; VI-NEXT: s_branch .LBB10_8 +; VI-NEXT: .LBB10_2: +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: .LBB10_3: ; %frem.compute +; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 +; VI-NEXT: 
v_frexp_mant_f32_e32 v4, v6 +; VI-NEXT: v_frexp_mant_f32_e32 v6, v5 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v5 +; VI-NEXT: v_ldexp_f32 v5, v6, 1 +; VI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 +; VI-NEXT: v_ldexp_f32 v7, v4, 11 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v10 +; VI-NEXT: v_not_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 +; VI-NEXT: v_rcp_f32_e32 v12, v11 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; VI-NEXT: v_fma_f32 v12, v13, v12, v12 +; VI-NEXT: v_mul_f32_e32 v13, v8, v12 +; VI-NEXT: v_fma_f32 v14, -v11, v13, v8 +; VI-NEXT: v_fma_f32 v13, v14, v12, v13 +; VI-NEXT: v_fma_f32 v8, -v11, v13, v8 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v8, v8, v12, v13 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 +; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, 11, v6 +; VI-NEXT: .LBB10_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mul_f32_e32 v7, v9, v8 +; VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v7, -v7, v5, v9 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v10, v7, v5 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, -11, v6 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v6 +; VI-NEXT: v_ldexp_f32 v7, v7, 11 +; VI-NEXT: s_cbranch_vccnz .LBB10_5 +; VI-NEXT: ; %bb.6: ; %Flow133 +; VI-NEXT: v_mov_b32_e32 v7, v9 +; VI-NEXT: .LBB10_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v6, vcc, -10, v6 +; VI-NEXT: v_ldexp_f32 v6, v7, v6 +; VI-NEXT: v_mul_f32_e32 v7, v6, v8 +; VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; VI-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-NEXT: 
v_cndmask_b32_e32 v5, v6, v5, vcc +; VI-NEXT: v_ldexp_f32 v4, v5, v4 +; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v4, s2, v4, v0 +; VI-NEXT: .LBB10_8: +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_cvt_f32_f16_e64 v9, |v5| +; VI-NEXT: v_cvt_f32_f16_e64 v8, |v6| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; VI-NEXT: s_cbranch_vccz .LBB10_10 +; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v7, s2, 0, v5 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc +; VI-NEXT: s_cbranch_execz .LBB10_11 +; VI-NEXT: s_branch .LBB10_16 +; VI-NEXT: .LBB10_10: +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: .LBB10_11: ; %frem.compute19 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v12, v9 +; VI-NEXT: v_frexp_mant_f32_e32 v7, v9 +; VI-NEXT: v_frexp_mant_f32_e32 v9, v8 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v13, v8 +; VI-NEXT: v_ldexp_f32 v8, v9, 1 +; VI-NEXT: v_div_scale_f32 v14, s[2:3], v8, v8, 1.0 +; VI-NEXT: v_ldexp_f32 v10, v7, 11 +; VI-NEXT: v_add_u32_e32 v7, vcc, -1, v13 +; VI-NEXT: v_not_b32_e32 v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v12 +; VI-NEXT: v_div_scale_f32 v11, vcc, 1.0, v8, 1.0 +; VI-NEXT: v_rcp_f32_e32 v15, v14 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; VI-NEXT: v_fma_f32 v15, v16, v15, v15 +; VI-NEXT: v_mul_f32_e32 v16, v11, v15 +; VI-NEXT: v_fma_f32 v17, -v14, v16, v11 +; VI-NEXT: v_fma_f32 v16, v17, v15, v16 +; VI-NEXT: v_fma_f32 v11, -v14, v16, v11 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v11, v11, v15, v16 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 +; VI-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_15 +; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: v_sub_u32_e32 v9, vcc, v12, v13 +; VI-NEXT: v_add_u32_e32 v9, vcc, 11, v9 +; VI-NEXT: .LBB10_13: ; 
%frem.loop_body27 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v12, v10 +; VI-NEXT: v_mul_f32_e32 v10, v12, v11 +; VI-NEXT: v_rndne_f32_e32 v10, v10 +; VI-NEXT: v_fma_f32 v10, -v10, v8, v12 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; VI-NEXT: v_add_f32_e32 v13, v10, v8 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, -11, v9 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v9 +; VI-NEXT: v_ldexp_f32 v10, v10, 11 +; VI-NEXT: s_cbranch_vccnz .LBB10_13 +; VI-NEXT: ; %bb.14: ; %Flow129 +; VI-NEXT: v_mov_b32_e32 v10, v12 +; VI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; VI-NEXT: v_add_u32_e32 v9, vcc, -10, v9 +; VI-NEXT: v_ldexp_f32 v9, v10, v9 +; VI-NEXT: v_mul_f32_e32 v10, v9, v11 +; VI-NEXT: v_rndne_f32_e32 v10, v10 +; VI-NEXT: v_fma_f32 v9, -v10, v8, v9 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; VI-NEXT: v_add_f32_e32 v8, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; VI-NEXT: v_ldexp_f32 v7, v8, v7 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 -; VI-NEXT: v_trunc_f16_e32 v7, v7 -; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_or_b32_e32 v3, v3, v6 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; VI-NEXT: v_rcp_f32_e32 v9, v8 -; VI-NEXT: v_mul_f32_e32 v10, v6, v9 -; VI-NEXT: v_mad_f32 v11, -v8, v10, v6 -; VI-NEXT: v_mac_f32_e32 v10, v11, v9 -; VI-NEXT: v_mad_f32 v6, -v8, v10, v6 -; VI-NEXT: v_mul_f32_e32 v6, v6, v9 -; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; VI-NEXT: v_add_f32_e32 v6, v6, v10 -; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 -; VI-NEXT: v_trunc_f16_e32 v6, v6 -; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_rcp_f32_e32 v8, v7 -; VI-NEXT: v_mul_f32_e32 v9, v6, v8 -; VI-NEXT: v_mad_f32 v10, -v7, v9, v6 
-; VI-NEXT: v_mac_f32_e32 v9, v10, v8 -; VI-NEXT: v_mad_f32 v6, -v7, v9, v6 -; VI-NEXT: v_mul_f32_e32 v6, v6, v8 -; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; VI-NEXT: v_add_f32_e32 v6, v6, v9 -; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 -; VI-NEXT: v_trunc_f16_e32 v6, v6 -; VI-NEXT: v_fma_f16 v2, -v6, v4, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v7, s2, v7, v5 +; VI-NEXT: .LBB10_16: +; VI-NEXT: v_cvt_f32_f16_e64 v10, |v1| +; VI-NEXT: v_cvt_f32_f16_e64 v9, |v3| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v9 +; VI-NEXT: s_cbranch_vccz .LBB10_18 +; VI-NEXT: ; %bb.17: ; %frem.else53 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v8, s2, 0, v1 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 +; VI-NEXT: v_cndmask_b32_e32 v8, v1, v8, vcc +; VI-NEXT: s_cbranch_execz .LBB10_19 +; VI-NEXT: s_branch .LBB10_24 +; VI-NEXT: .LBB10_18: +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: .LBB10_19: ; %frem.compute52 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 +; VI-NEXT: v_frexp_mant_f32_e32 v8, v10 +; VI-NEXT: v_frexp_mant_f32_e32 v10, v9 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v14, v9 +; VI-NEXT: v_ldexp_f32 v9, v10, 1 +; VI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 +; VI-NEXT: v_ldexp_f32 v11, v8, 11 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v14 +; VI-NEXT: v_not_b32_e32 v10, v8 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v13 +; VI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v9, 1.0 +; VI-NEXT: v_rcp_f32_e32 v16, v15 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v17, -v15, v16, 1.0 +; VI-NEXT: v_fma_f32 v16, v17, v16, v16 +; VI-NEXT: v_mul_f32_e32 v17, v12, v16 +; VI-NEXT: v_fma_f32 v18, -v15, v17, v12 +; VI-NEXT: v_fma_f32 v17, v18, v16, v17 +; VI-NEXT: v_fma_f32 v12, -v15, v17, v12 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v12, v12, v16, v17 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 +; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 
1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_23 +; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14 +; VI-NEXT: v_add_u32_e32 v10, vcc, 11, v10 +; VI-NEXT: .LBB10_21: ; %frem.loop_body60 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v13, v11 +; VI-NEXT: v_mul_f32_e32 v11, v13, v12 +; VI-NEXT: v_rndne_f32_e32 v11, v11 +; VI-NEXT: v_fma_f32 v11, -v11, v9, v13 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; VI-NEXT: v_add_f32_e32 v14, v11, v9 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, -11, v10 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v10 +; VI-NEXT: v_ldexp_f32 v11, v11, 11 +; VI-NEXT: s_cbranch_vccnz .LBB10_21 +; VI-NEXT: ; %bb.22: ; %Flow125 +; VI-NEXT: v_mov_b32_e32 v11, v13 +; VI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; VI-NEXT: v_add_u32_e32 v10, vcc, -10, v10 +; VI-NEXT: v_ldexp_f32 v10, v11, v10 +; VI-NEXT: v_mul_f32_e32 v11, v10, v12 +; VI-NEXT: v_rndne_f32_e32 v11, v11 +; VI-NEXT: v_fma_f32 v10, -v11, v9, v10 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; VI-NEXT: v_add_f32_e32 v9, v10, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; VI-NEXT: v_ldexp_f32 v8, v9, v8 +; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v8, s2, v8, v1 +; VI-NEXT: .LBB10_24: +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; VI-NEXT: v_cvt_f32_f16_e64 v13, |v9| +; VI-NEXT: v_cvt_f32_f16_e64 v12, |v10| +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; VI-NEXT: s_cbranch_vccz .LBB10_26 +; VI-NEXT: ; %bb.25: ; %frem.else86 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v11, s2, 0, v9 +; VI-NEXT: v_cmp_eq_f32_e32 vcc, v13, v12 +; VI-NEXT: v_cndmask_b32_e32 v11, v9, v11, vcc +; VI-NEXT: s_cbranch_execz .LBB10_27 +; VI-NEXT: s_branch .LBB10_32 +; VI-NEXT: .LBB10_26: +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: .LBB10_27: ; %frem.compute85 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 +; VI-NEXT: 
v_frexp_mant_f32_e32 v11, v13 +; VI-NEXT: v_frexp_mant_f32_e32 v13, v12 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v17, v12 +; VI-NEXT: v_ldexp_f32 v12, v13, 1 +; VI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 +; VI-NEXT: v_ldexp_f32 v14, v11, 11 +; VI-NEXT: v_add_u32_e32 v11, vcc, -1, v17 +; VI-NEXT: v_not_b32_e32 v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v16 +; VI-NEXT: v_div_scale_f32 v15, vcc, 1.0, v12, 1.0 +; VI-NEXT: v_rcp_f32_e32 v19, v18 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v20, -v18, v19, 1.0 +; VI-NEXT: v_fma_f32 v19, v20, v19, v19 +; VI-NEXT: v_mul_f32_e32 v20, v15, v19 +; VI-NEXT: v_fma_f32 v21, -v18, v20, v15 +; VI-NEXT: v_fma_f32 v20, v21, v19, v20 +; VI-NEXT: v_fma_f32 v15, -v18, v20, v15 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v15, v15, v19, v20 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 +; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB10_31 +; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 11, v13 +; VI-NEXT: .LBB10_29: ; %frem.loop_body93 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v16, v14 +; VI-NEXT: v_mul_f32_e32 v14, v16, v15 +; VI-NEXT: v_rndne_f32_e32 v14, v14 +; VI-NEXT: v_fma_f32 v14, -v14, v12, v16 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v14 +; VI-NEXT: v_add_f32_e32 v17, v14, v12 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, -11, v13 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v13 +; VI-NEXT: v_ldexp_f32 v14, v14, 11 +; VI-NEXT: s_cbranch_vccnz .LBB10_29 +; VI-NEXT: ; %bb.30: ; %Flow +; VI-NEXT: v_mov_b32_e32 v14, v16 +; VI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; VI-NEXT: v_add_u32_e32 v13, vcc, -10, v13 +; VI-NEXT: v_ldexp_f32 v13, v14, v13 +; VI-NEXT: v_mul_f32_e32 v14, v13, v15 +; VI-NEXT: v_rndne_f32_e32 v14, v14 +; VI-NEXT: v_fma_f32 v13, -v14, v12, v13 +; VI-NEXT: 
v_cmp_gt_f32_e32 vcc, 0, v13 +; VI-NEXT: v_add_f32_e32 v12, v13, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_ldexp_f32 v11, v12, v11 +; VI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: v_bfi_b32 v11, s2, v11, v9 +; VI-NEXT: .LBB10_32: ; %Flow124 +; VI-NEXT: s_movk_i32 s4, 0x7c00 +; VI-NEXT: v_cmp_lg_f16_e32 vcc, 0, v2 +; VI-NEXT: v_cmp_nge_f16_e64 s[2:3], |v0|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; VI-NEXT: v_cmp_lg_f16_e32 vcc, 0, v6 +; VI-NEXT: v_cmp_nge_f16_e64 s[2:3], |v5|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_cndmask_b32_sdwa v5, v2, v7, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_lg_f16_e32 vcc, 0, v3 +; VI-NEXT: v_cmp_nge_f16_e64 s[2:3], |v1|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v8, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_cmp_lg_f16_e32 vcc, 0, v10 +; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |v9|, s4 +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v11, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -3264,73 +8297,316 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] offset:32 ; 
GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v6, |v2| ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GFX9-NEXT: v_cvt_f32_f16_e32 v9, v8 -; GFX9-NEXT: v_rcp_f32_e32 v6, v6 -; GFX9-NEXT: v_rcp_f32_e32 v9, v9 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mac_f32_e32 v5, v7, v6 -; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX9-NEXT: v_cvt_f32_f16_e64 v5, |v0| +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 +; GFX9-NEXT: s_cbranch_vccz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: s_branch .LBB10_8 +; GFX9-NEXT: .LBB10_2: +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: .LBB10_3: ; %frem.compute +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 +; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6 +; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v10, v5 +; GFX9-NEXT: v_ldexp_f32 v5, v6, 1 +; GFX9-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 +; GFX9-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 +; GFX9-NEXT: v_ldexp_f32 v7, v4, 11 +; GFX9-NEXT: v_add_u32_e32 v4, -1, v10 +; GFX9-NEXT: v_not_b32_e32 v6, v4 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v9 +; GFX9-NEXT: v_rcp_f32_e32 v12, v11 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; GFX9-NEXT: v_fma_f32 v12, v13, v12, v12 +; GFX9-NEXT: v_mul_f32_e32 v13, v8, v12 +; GFX9-NEXT: v_fma_f32 v14, -v11, v13, v8 +; GFX9-NEXT: v_fma_f32 v13, v14, v12, v13 +; GFX9-NEXT: v_fma_f32 v8, -v11, v13, v8 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 
0 +; GFX9-NEXT: v_div_fmas_f32 v8, v8, v12, v13 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 +; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 +; GFX9-NEXT: v_add_u32_e32 v6, 11, v6 +; GFX9-NEXT: .LBB10_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 +; GFX9-NEXT: v_rndne_f32_e32 v7, v7 +; GFX9-NEXT: v_fma_f32 v7, -v7, v5, v9 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; GFX9-NEXT: v_add_f32_e32 v10, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v6, -11, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 11, v6 +; GFX9-NEXT: v_ldexp_f32 v7, v7, 11 +; GFX9-NEXT: s_cbranch_vccnz .LBB10_5 +; GFX9-NEXT: ; %bb.6: ; %Flow133 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX9-NEXT: v_add_u32_e32 v6, -10, v6 +; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 +; GFX9-NEXT: v_rndne_f32_e32 v7, v7 +; GFX9-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 ; GFX9-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v9 -; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 -; GFX9-NEXT: v_mad_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-NEXT: v_trunc_f16_e32 v5, v5 -; GFX9-NEXT: v_mac_f32_e32 v7, v10, v9 -; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 -; GFX9-NEXT: v_mad_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_div_fixup_f16 v1, v1, v8, v6 -; 
GFX9-NEXT: v_trunc_f16_e32 v1, v1 -; GFX9-NEXT: v_fma_f16 v1, -v1, v8, v6 -; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v7 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_rcp_f32_e32 v8, v8 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mac_f32_e32 v3, v6, v5 -; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v5, v6, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 -; GFX9-NEXT: v_mad_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-NEXT: v_trunc_f16_e32 v3, v3 -; GFX9-NEXT: v_mac_f32_e32 v6, v9, v8 -; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 -; GFX9-NEXT: v_mad_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_div_fixup_f16 v0, v0, v7, v5 -; GFX9-NEXT: v_trunc_f16_e32 v0, v0 -; GFX9-NEXT: v_fma_f16 v0, -v0, v7, v5 -; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v2 +; GFX9-NEXT: .LBB10_8: +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_cvt_f32_f16_e64 v8, |v5| +; GFX9-NEXT: v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7 +; GFX9-NEXT: s_cbranch_vccz .LBB10_10 +; GFX9-NEXT: ; %bb.9: ; %frem.else20 +; GFX9-NEXT: s_movk_i32 s2, 
0x7fff +; GFX9-NEXT: v_bfi_b32 v6, s2, 0, v5 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_11 +; GFX9-NEXT: s_branch .LBB10_16 +; GFX9-NEXT: .LBB10_10: +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: .LBB10_11: ; %frem.compute19 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v11, v8 +; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v8 +; GFX9-NEXT: v_frexp_mant_f32_e32 v8, v7 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v12, v7 +; GFX9-NEXT: v_ldexp_f32 v7, v8, 1 +; GFX9-NEXT: v_div_scale_f32 v13, s[2:3], v7, v7, 1.0 +; GFX9-NEXT: v_div_scale_f32 v10, vcc, 1.0, v7, 1.0 +; GFX9-NEXT: v_ldexp_f32 v9, v6, 11 +; GFX9-NEXT: v_add_u32_e32 v6, -1, v12 +; GFX9-NEXT: v_not_b32_e32 v8, v6 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v11 +; GFX9-NEXT: v_rcp_f32_e32 v14, v13 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v15, -v13, v14, 1.0 +; GFX9-NEXT: v_fma_f32 v14, v15, v14, v14 +; GFX9-NEXT: v_mul_f32_e32 v15, v10, v14 +; GFX9-NEXT: v_fma_f32 v16, -v13, v15, v10 +; GFX9-NEXT: v_fma_f32 v15, v16, v14, v15 +; GFX9-NEXT: v_fma_f32 v10, -v13, v15, v10 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v10, v10, v14, v15 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v8 +; GFX9-NEXT: v_div_fixup_f32 v10, v10, v7, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX9-NEXT: v_sub_u32_e32 v8, v11, v12 +; GFX9-NEXT: v_add_u32_e32 v8, 11, v8 +; GFX9-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-NEXT: v_mul_f32_e32 v9, v11, v10 +; GFX9-NEXT: v_rndne_f32_e32 v9, v9 +; GFX9-NEXT: v_fma_f32 v9, -v9, v7, v11 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; GFX9-NEXT: v_add_f32_e32 v12, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v8, -11, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 11, v8 +; GFX9-NEXT: 
v_ldexp_f32 v9, v9, 11 +; GFX9-NEXT: s_cbranch_vccnz .LBB10_13 +; GFX9-NEXT: ; %bb.14: ; %Flow129 +; GFX9-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX9-NEXT: v_add_u32_e32 v8, -10, v8 +; GFX9-NEXT: v_ldexp_f32 v8, v9, v8 +; GFX9-NEXT: v_mul_f32_e32 v9, v8, v10 +; GFX9-NEXT: v_rndne_f32_e32 v9, v9 +; GFX9-NEXT: v_fma_f32 v8, -v9, v7, v8 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 +; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v6, s2, v6, v5 +; GFX9-NEXT: .LBB10_16: +; GFX9-NEXT: v_cvt_f32_f16_e64 v9, |v3| +; GFX9-NEXT: v_cvt_f32_f16_e64 v8, |v1| +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; GFX9-NEXT: s_cbranch_vccz .LBB10_18 +; GFX9-NEXT: ; %bb.17: ; %frem.else53 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v7, s2, 0, v3 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_19 +; GFX9-NEXT: s_branch .LBB10_24 +; GFX9-NEXT: .LBB10_18: +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: .LBB10_19: ; %frem.compute52 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v12, v9 +; GFX9-NEXT: v_frexp_mant_f32_e32 v7, v9 +; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v8 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v13, v8 +; GFX9-NEXT: v_ldexp_f32 v8, v9, 1 +; GFX9-NEXT: v_div_scale_f32 v14, s[2:3], v8, v8, 1.0 +; GFX9-NEXT: v_div_scale_f32 v11, vcc, 1.0, v8, 1.0 +; GFX9-NEXT: v_ldexp_f32 v10, v7, 11 +; GFX9-NEXT: v_add_u32_e32 v7, -1, v13 +; GFX9-NEXT: v_not_b32_e32 v9, v7 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-NEXT: v_rcp_f32_e32 v15, v14 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; GFX9-NEXT: v_fma_f32 v15, v16, v15, v15 +; GFX9-NEXT: v_mul_f32_e32 v16, v11, v15 +; GFX9-NEXT: v_fma_f32 v17, -v14, v16, v11 +; GFX9-NEXT: v_fma_f32 v16, 
v17, v15, v16 +; GFX9-NEXT: v_fma_f32 v11, -v14, v16, v11 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v11, v11, v15, v16 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 +; GFX9-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX9-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX9-NEXT: v_sub_u32_e32 v9, v12, v13 +; GFX9-NEXT: v_add_u32_e32 v9, 11, v9 +; GFX9-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-NEXT: v_mul_f32_e32 v10, v12, v11 +; GFX9-NEXT: v_rndne_f32_e32 v10, v10 +; GFX9-NEXT: v_fma_f32 v10, -v10, v8, v12 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; GFX9-NEXT: v_add_f32_e32 v13, v10, v8 +; GFX9-NEXT: v_add_u32_e32 v9, -11, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 11, v9 +; GFX9-NEXT: v_ldexp_f32 v10, v10, 11 +; GFX9-NEXT: s_cbranch_vccnz .LBB10_21 +; GFX9-NEXT: ; %bb.22: ; %Flow125 +; GFX9-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX9-NEXT: v_add_u32_e32 v9, -10, v9 +; GFX9-NEXT: v_ldexp_f32 v9, v10, v9 +; GFX9-NEXT: v_mul_f32_e32 v10, v9, v11 +; GFX9-NEXT: v_rndne_f32_e32 v10, v10 +; GFX9-NEXT: v_fma_f32 v9, -v10, v8, v9 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_ldexp_f32 v7, v8, v7 +; GFX9-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v7, s2, v7, v3 +; GFX9-NEXT: .LBB10_24: +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_cvt_f32_f16_e64 v11, |v8| +; GFX9-NEXT: v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; GFX9-NEXT: s_cbranch_vccz .LBB10_26 +; GFX9-NEXT: ; %bb.25: ; %frem.else86 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v8 +; GFX9-NEXT: 
v_cmp_eq_f32_e32 vcc, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_27 +; GFX9-NEXT: s_branch .LBB10_32 +; GFX9-NEXT: .LBB10_26: +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: .LBB10_27: ; %frem.compute85 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 +; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v11 +; GFX9-NEXT: v_frexp_mant_f32_e32 v11, v10 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v15, v10 +; GFX9-NEXT: v_ldexp_f32 v10, v11, 1 +; GFX9-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 +; GFX9-NEXT: v_div_scale_f32 v13, vcc, 1.0, v10, 1.0 +; GFX9-NEXT: v_ldexp_f32 v12, v9, 11 +; GFX9-NEXT: v_add_u32_e32 v9, -1, v15 +; GFX9-NEXT: v_not_b32_e32 v11, v9 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v14 +; GFX9-NEXT: v_rcp_f32_e32 v17, v16 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; GFX9-NEXT: v_fma_f32 v17, v18, v17, v17 +; GFX9-NEXT: v_mul_f32_e32 v18, v13, v17 +; GFX9-NEXT: v_fma_f32 v19, -v16, v18, v13 +; GFX9-NEXT: v_fma_f32 v18, v19, v17, v18 +; GFX9-NEXT: v_fma_f32 v13, -v16, v18, v13 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v13, v13, v17, v18 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 +; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX9-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15 +; GFX9-NEXT: v_add_u32_e32 v11, 11, v11 +; GFX9-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13 +; GFX9-NEXT: v_rndne_f32_e32 v12, v12 +; GFX9-NEXT: v_fma_f32 v12, -v12, v10, v14 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; GFX9-NEXT: v_add_f32_e32 v15, v12, v10 +; GFX9-NEXT: v_add_u32_e32 v11, -11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 11, v11 +; GFX9-NEXT: v_ldexp_f32 v12, v12, 11 
+; GFX9-NEXT: s_cbranch_vccnz .LBB10_29 +; GFX9-NEXT: ; %bb.30: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX9-NEXT: v_add_u32_e32 v11, -10, v11 +; GFX9-NEXT: v_ldexp_f32 v11, v12, v11 +; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13 +; GFX9-NEXT: v_rndne_f32_e32 v12, v12 +; GFX9-NEXT: v_fma_f32 v11, -v12, v10, v11 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GFX9-NEXT: v_ldexp_f32 v9, v10, v9 +; GFX9-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_bfi_b32 v9, s2, v9, v8 +; GFX9-NEXT: .LBB10_32: ; %Flow124 +; GFX9-NEXT: s_movk_i32 s6, 0x7c00 +; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], |v2|, s6 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX9-NEXT: v_cmp_lg_f16_sdwa s[2:3], v0, v10 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_nge_f16_e64 s[4:5], |v5|, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX9-NEXT: s_and_b64 vcc, s[4:5], s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], |v3|, s6 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_cmp_lg_f16_sdwa s[2:3], v1, v10 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cmp_nge_f16_e64 s[4:5], |v8|, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v7, vcc +; GFX9-NEXT: s_and_b64 vcc, s[4:5], s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v4f16: @@ -3341,72 +8617,317 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v6, |v2| ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX10-NEXT: v_rcp_f32_e32 v7, v6 -; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5 -; GFX10-NEXT: v_mac_f32_e32 v8, v9, v7 -; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX10-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 -; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX10-NEXT: v_rcp_f32_e32 v8, v7 -; GFX10-NEXT: v_mul_f32_e32 v9, v6, v8 -; GFX10-NEXT: v_mad_f32 v10, -v7, v9, v6 -; GFX10-NEXT: v_mac_f32_e32 v9, v10, v8 -; GFX10-NEXT: v_mad_f32 v6, -v7, v9, v6 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX10-NEXT: v_cvt_f32_f16_e64 v5, |v0| +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 +; GFX10-NEXT: s_cbranch_vccz .LBB10_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, 0, v2 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB10_3 +; GFX10-NEXT: s_branch .LBB10_8 +; GFX10-NEXT: .LBB10_2: +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: .LBB10_3: ; %frem.compute +; GFX10-NEXT: 
v_frexp_mant_f32_e32 v4, v6 +; GFX10-NEXT: v_frexp_mant_f32_e32 v8, v5 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 +; GFX10-NEXT: v_ldexp_f32 v6, v4, 11 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 +; GFX10-NEXT: v_ldexp_f32 v5, v8, 1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v7 +; GFX10-NEXT: v_readfirstlane_b32 s3, v4 +; GFX10-NEXT: v_div_scale_f32 v9, s4, v5, v5, 1.0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; GFX10-NEXT: v_rcp_f32_e32 v10, v9 +; GFX10-NEXT: v_not_b32_e32 v8, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 +; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 11 +; GFX10-NEXT: .LBB10_5: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-NEXT: s_add_i32 s2, s2, -11 +; GFX10-NEXT: s_cmp_gt_i32 s2, 11 +; GFX10-NEXT: v_mul_f32_e32 v6, v9, v7 +; GFX10-NEXT: v_rndne_f32_e32 v6, v6 +; GFX10-NEXT: v_fma_f32 v6, -v6, v5, v9 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_add_f32_e32 v8, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v6, v6, 11 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_5 +; GFX10-NEXT: ; %bb.6: ; %Flow133 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, v9 +; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8 +; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 +; GFX10-NEXT: 
v_mul_f32_e32 v7, v6, v7 +; GFX10-NEXT: v_rndne_f32_e32 v7, v7 +; GFX10-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, v4, v2 +; GFX10-NEXT: .LBB10_8: +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v5| +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: s_cbranch_vccz .LBB10_10 +; GFX10-NEXT: ; %bb.9: ; %frem.else20 +; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, 0, v5 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB10_11 +; GFX10-NEXT: s_branch .LBB10_16 +; GFX10-NEXT: .LBB10_10: +; GFX10-NEXT: ; implicit-def: $vgpr6 +; GFX10-NEXT: .LBB10_11: ; %frem.compute19 +; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v8 +; GFX10-NEXT: v_frexp_mant_f32_e32 v10, v7 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v9, v8 +; GFX10-NEXT: v_ldexp_f32 v8, v6, 11 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v6, v7 +; GFX10-NEXT: v_ldexp_f32 v7, v10, 1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v9 +; GFX10-NEXT: v_readfirstlane_b32 s3, v6 +; GFX10-NEXT: v_div_scale_f32 v11, s4, v7, v7, 1.0 +; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 +; GFX10-NEXT: v_rcp_f32_e32 v12, v11 +; GFX10-NEXT: v_not_b32_e32 v10, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, 1.0, v7, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v12 +; GFX10-NEXT: v_mul_f32_e32 v13, v9, v12 +; GFX10-NEXT: v_fma_f32 v14, -v11, v13, v9 +; GFX10-NEXT: v_fmac_f32_e32 v13, v14, v12 +; GFX10-NEXT: v_fma_f32 v9, -v11, v13, v9 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v9, v9, 
v12, v13 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v10 +; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 11 +; GFX10-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: s_add_i32 s2, s2, -11 +; GFX10-NEXT: s_cmp_gt_i32 s2, 11 +; GFX10-NEXT: v_mul_f32_e32 v8, v11, v9 +; GFX10-NEXT: v_rndne_f32_e32 v8, v8 +; GFX10-NEXT: v_fma_f32 v8, -v8, v7, v11 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v8 +; GFX10-NEXT: v_add_f32_e32 v10, v8, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v8, v8, 11 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_13 +; GFX10-NEXT: ; %bb.14: ; %Flow129 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, v11 +; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX10-NEXT: v_add_nc_u32_e32 v10, -10, v10 +; GFX10-NEXT: v_ldexp_f32 v8, v8, v10 +; GFX10-NEXT: v_mul_f32_e32 v9, v8, v9 +; GFX10-NEXT: v_rndne_f32_e32 v9, v9 +; GFX10-NEXT: v_fma_f32 v8, -v9, v7, v8 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v8 +; GFX10-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v6, v7, v6 ; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 -; GFX10-NEXT: v_trunc_f16_e32 v6, v6 -; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX10-NEXT: v_rcp_f32_e32 v6, v5 -; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v3 -; GFX10-NEXT: v_mac_f32_e32 v7, v8, v6 -; GFX10-NEXT: v_mad_f32 v3, -v5, v7, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_cvt_f16_f32_e32 
v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 -; GFX10-NEXT: v_trunc_f16_e32 v3, v3 -; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX10-NEXT: v_rcp_f32_e32 v7, v6 -; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5 -; GFX10-NEXT: v_mac_f32_e32 v8, v9, v7 -; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX10-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 -; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 +; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, v6, v5 +; GFX10-NEXT: .LBB10_16: +; GFX10-NEXT: v_cvt_f32_f16_e64 v9, |v3| +; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v1| +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 +; GFX10-NEXT: s_cbranch_vccz .LBB10_18 +; GFX10-NEXT: ; %bb.17: ; %frem.else53 +; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, 0, v3 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB10_19 +; GFX10-NEXT: s_branch .LBB10_24 +; GFX10-NEXT: .LBB10_18: +; GFX10-NEXT: ; implicit-def: $vgpr7 +; GFX10-NEXT: .LBB10_19: ; %frem.compute52 +; GFX10-NEXT: v_frexp_mant_f32_e32 v7, v9 +; GFX10-NEXT: v_frexp_mant_f32_e32 v11, v8 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 +; GFX10-NEXT: v_ldexp_f32 v9, v7, 11 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v8 +; GFX10-NEXT: v_ldexp_f32 v8, v11, 1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v10 +; GFX10-NEXT: v_readfirstlane_b32 s3, v7 +; GFX10-NEXT: v_div_scale_f32 v12, s4, v8, v8, 1.0 +; GFX10-NEXT: v_add_nc_u32_e32 v7, -1, v7 +; GFX10-NEXT: v_rcp_f32_e32 v13, v12 +; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v10 +; 
GFX10-NEXT: v_div_scale_f32 v10, vcc_lo, 1.0, v8, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v13, v14, v13 +; GFX10-NEXT: v_mul_f32_e32 v14, v10, v13 +; GFX10-NEXT: v_fma_f32 v15, -v12, v14, v10 +; GFX10-NEXT: v_fmac_f32_e32 v14, v15, v13 +; GFX10-NEXT: v_fma_f32 v10, -v12, v14, v10 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v10, v10, v13, v14 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v11 +; GFX10-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX10-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 11 +; GFX10-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v12, v9 +; GFX10-NEXT: s_add_i32 s2, s2, -11 +; GFX10-NEXT: s_cmp_gt_i32 s2, 11 +; GFX10-NEXT: v_mul_f32_e32 v9, v12, v10 +; GFX10-NEXT: v_rndne_f32_e32 v9, v9 +; GFX10-NEXT: v_fma_f32 v9, -v9, v8, v12 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: v_add_f32_e32 v11, v9, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v9, v9, 11 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_21 +; GFX10-NEXT: ; %bb.22: ; %Flow125 +; GFX10-NEXT: v_mov_b32_e32 v11, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, v12 +; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX10-NEXT: v_add_nc_u32_e32 v11, -10, v11 +; GFX10-NEXT: v_ldexp_f32 v9, v9, v11 +; GFX10-NEXT: v_mul_f32_e32 v10, v9, v10 +; GFX10-NEXT: v_rndne_f32_e32 v10, v10 +; GFX10-NEXT: v_fma_f32 v9, -v10, v8, v9 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v7, v8, v7 +; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, v7, v3 +; GFX10-NEXT: .LBB10_24: +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v11, |v8| +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v11, v10 +; GFX10-NEXT: s_cbranch_vccz .LBB10_26 +; GFX10-NEXT: ; %bb.25: ; %frem.else86 +; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, 0, v8 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v11, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB10_27 +; GFX10-NEXT: s_branch .LBB10_32 +; GFX10-NEXT: .LBB10_26: +; GFX10-NEXT: ; implicit-def: $vgpr9 +; GFX10-NEXT: .LBB10_27: ; %frem.compute85 +; GFX10-NEXT: v_frexp_mant_f32_e32 v9, v11 +; GFX10-NEXT: v_frexp_mant_f32_e32 v13, v10 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v11 +; GFX10-NEXT: v_ldexp_f32 v11, v9, 11 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v9, v10 +; GFX10-NEXT: v_ldexp_f32 v10, v13, 1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v12 +; GFX10-NEXT: v_readfirstlane_b32 s3, v9 +; GFX10-NEXT: v_div_scale_f32 v14, s4, v10, v10, 1.0 +; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9 +; GFX10-NEXT: v_rcp_f32_e32 v15, v14 +; GFX10-NEXT: v_not_b32_e32 v13, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v13, v13, v12 +; GFX10-NEXT: v_div_scale_f32 v12, vcc_lo, 1.0, v10, 1.0 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v15, v16, v15 +; GFX10-NEXT: v_mul_f32_e32 v16, v12, v15 +; GFX10-NEXT: v_fma_f32 v17, -v14, v16, v12 +; GFX10-NEXT: v_fmac_f32_e32 v16, v17, v15 +; GFX10-NEXT: v_fma_f32 v12, -v14, v16, v12 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v12, v12, v15, v16 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v13 +; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX10-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 11 +; GFX10-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v14, v11 +; GFX10-NEXT: s_add_i32 s2, s2, -11 +; GFX10-NEXT: 
s_cmp_gt_i32 s2, 11 +; GFX10-NEXT: v_mul_f32_e32 v11, v14, v12 +; GFX10-NEXT: v_rndne_f32_e32 v11, v11 +; GFX10-NEXT: v_fma_f32 v11, -v11, v10, v14 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: v_add_f32_e32 v13, v11, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v11, v11, 11 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_29 +; GFX10-NEXT: ; %bb.30: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v13, s2 +; GFX10-NEXT: v_mov_b32_e32 v11, v14 +; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX10-NEXT: v_add_nc_u32_e32 v13, -10, v13 +; GFX10-NEXT: v_ldexp_f32 v11, v11, v13 +; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12 +; GFX10-NEXT: v_rndne_f32_e32 v12, v12 +; GFX10-NEXT: v_fma_f32 v11, -v12, v10, v11 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v9, v10, v9 +; GFX10-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, v9, v8 +; GFX10-NEXT: .LBB10_32: ; %Flow124 +; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v2| +; GFX10-NEXT: v_cmp_nle_f16_e64 s3, 0x7c00, |v5| +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_cmp_lg_f16_sdwa s2, v0, v4 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v3| +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v6, vcc_lo +; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_nle_f16_e64 s3, 0x7c00, |v8| +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cmp_lg_f16_sdwa s2, v1, v4 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v7, vcc_lo +; GFX10-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; 
GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v9, vcc_lo +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -3415,103 +8936,412 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 +; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v2, s[4:5] offset:32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v6, |v0.l| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v9, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v9, v9 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v5, |v2.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_3 +; GFX11-TRUE16-NEXT: s_branch .LBB10_8 +; 
GFX11-TRUE16-NEXT: .LBB10_2: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, v6 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v5 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v4, 11 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v5, v8, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_scale_f32 v9, null, v5, v5, 1.0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v10, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0 +; GFX11-TRUE16-NEXT: s_denorm_mode 15 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v7, -v9, v11, v7 +; 
GFX11-TRUE16-NEXT: s_denorm_mode 12 +; GFX11-TRUE16-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v6, v9, v7 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v6, -v6, v5, v9 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, 11 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB10_5 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow133 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v9 +; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, -10, v8 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 
v7, v6, v7 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v7, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v0 +; GFX11-TRUE16-NEXT: .LBB10_8: +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v7, v7, v9 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v10, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v9, |v5.l| +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v8, |v6.l| +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10 +; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, 0, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v7.l, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_11 +; GFX11-TRUE16-NEXT: s_branch .LBB10_16 +; GFX11-TRUE16-NEXT: .LBB10_10: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v7, v9 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v8 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v7, 11 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v8, v11, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_scale_f32 v12, null, v8, v8, 1.0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, -1, v7 +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v13, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v10 +; GFX11-TRUE16-NEXT: v_div_scale_f32 v10, vcc_lo, 1.0, v8, 1.0 +; GFX11-TRUE16-NEXT: s_denorm_mode 15 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v13, v14, v13 +; GFX11-TRUE16-NEXT: 
v_mul_f32_e32 v14, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v15, -v12, v14, v10 +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v14, v15, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v10, -v12, v14, v10 +; GFX11-TRUE16-NEXT: s_denorm_mode 12 +; GFX11-TRUE16-NEXT: v_div_fmas_f32 v10, v10, v13, v14 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v9, v12, v10 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v9, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v9, -v9, v8, v12 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, v9, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v9, 11 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB10_13 +; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow129 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s2 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v12 +; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v7, v10, v9 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v9 -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, -10, v11 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v9, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v10, v9, v10 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v10, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v9, -v10, v8, v9 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v8, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v7.l, v7 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v5 +; GFX11-TRUE16-NEXT: .LBB10_16: +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v10, |v1.l| +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v9, |v3.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18 +; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, 0, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v8.l, v1.l, v8.l, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_19 +; GFX11-TRUE16-NEXT: s_branch .LBB10_24 +; GFX11-TRUE16-NEXT: .LBB10_18: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v10 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v12, v9 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v10, v8, 11 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v9 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v12, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_scale_f32 v13, null, v9, v9, 1.0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, -1, v8 +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v14, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v12, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v11 +; GFX11-TRUE16-NEXT: v_div_scale_f32 v11, vcc_lo, 1.0, v9, 1.0 +; GFX11-TRUE16-NEXT: s_denorm_mode 15 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_fma_f32 v15, -v13, v14, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v14, v15, v14 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v15, v11, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v16, -v13, v15, v11 +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v15, v16, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_fma_f32 v11, -v13, v15, v11 +; GFX11-TRUE16-NEXT: s_denorm_mode 12 +; GFX11-TRUE16-NEXT: v_div_fmas_f32 v11, v11, v14, v15 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s2, 11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v10, v13, v11 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v10, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v10, -v10, v9, v13 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, v10, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v10, v10, 11 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB10_21 +; GFX11-TRUE16-NEXT: ; %bb.22: ; %Flow125 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, s2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v13 +; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, -10, v12 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v10, v10, v12 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h -; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v8, v8 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v11, v10, v11 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v11, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v1 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_f32 v10, -v11, v9, v10 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v1 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v8, v9, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v8.l, v8 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v1 +; GFX11-TRUE16-NEXT: .LBB10_24: +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v9, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v13, |v9.l| +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v12, |v10.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26 +; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, 0, v9 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v9.l, v11.l, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_27 +; GFX11-TRUE16-NEXT: s_branch .LBB10_32 +; GFX11-TRUE16-NEXT: .LBB10_26: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v13 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v15, v12 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_ldexp_f32 v13, v11, 11 +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v12 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v12, v15, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v14 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_scale_f32 v16, null, v12, v12, 1.0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, -1, v11 +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v17, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v15, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_div_scale_f32 v14, vcc_lo, 1.0, v12, 1.0 +; GFX11-TRUE16-NEXT: s_denorm_mode 15 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v17, v18, v17 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v18, v14, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v19, -v16, v18, v14 +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v18, v19, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v14, -v16, v18, v14 +; GFX11-TRUE16-NEXT: s_denorm_mode 12 +; GFX11-TRUE16-NEXT: v_div_fmas_f32 v14, v14, v17, v18 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, v13 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v13, v16, v14 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v13, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_f32 v13, -v13, v12, v16 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, v13, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v13, v13, 11 +; GFX11-TRUE16-NEXT: s_cbranch_scc1 .LBB10_29 +; GFX11-TRUE16-NEXT: ; %bb.30: ; %Flow +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v16 +; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v6, v9, v8 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v1, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, -10, v15 +; GFX11-TRUE16-NEXT: v_ldexp_f32 v13, v13, v15 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v1, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v14, v13, v14 +; GFX11-TRUE16-NEXT: v_rndne_f32_e32 v14, v14 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: v_fma_f32 v13, -v14, v12, v13 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, v13, 
v12 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v7.l, v4.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_ldexp_f32 v11, v12, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v7.l, v4.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v11.l, v11 +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v9 +; GFX11-TRUE16-NEXT: .LBB10_32: ; %Flow124 +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v2.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l| +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v6.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, s2 +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v5.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v7.l, s2 +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l| +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v10.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v8.l, s2 +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v9.l| +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v11.l, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: frem_v4f16: @@ -3519,556 +9349,2264 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v2, s[4:5] offset:32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v6, |v0| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v9, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v9, v9 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v5, |v2| +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_3 +; 
GFX11-FAKE16-NEXT: s_branch .LBB10_8 +; GFX11-FAKE16-NEXT: .LBB10_2: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, v6 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v5 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v6, v4, 11 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v5, v8, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_scale_f32 v9, null, v5, v5, 1.0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v10, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v8, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-FAKE16-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0 +; GFX11-FAKE16-NEXT: s_denorm_mode 15 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_fma_f32 v7, -v9, v11, v7 +; GFX11-FAKE16-NEXT: s_denorm_mode 12 +; GFX11-FAKE16-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v6, v9, v7 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v6, -v6, v5, v9 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v6, v6, 11 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB10_5 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow133 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v9 +; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, -10, v8 +; GFX11-FAKE16-NEXT: 
v_ldexp_f32 v6, v6, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v7, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX11-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v7, v7, v9 -; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v0 +; GFX11-FAKE16-NEXT: .LBB10_8: +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-FAKE16-NEXT: v_fma_f16 v5, -v5, v3, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 
v7, v10, v9 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v9, |v5| +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v8, |v6| +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_10 +; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-FAKE16-NEXT: v_bfi_b32 v7, 0x7fff, 0, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_11 +; GFX11-FAKE16-NEXT: s_branch .LBB10_16 +; GFX11-FAKE16-NEXT: .LBB10_10: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v7, v9 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v8 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v7, 11 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v8 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v8, v11, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_scale_f32 v12, null, v8, v8, 1.0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, -1, v7 +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v13, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v10 +; GFX11-FAKE16-NEXT: v_div_scale_f32 v10, vcc_lo, 1.0, v8, 1.0 +; GFX11-FAKE16-NEXT: 
s_denorm_mode 15 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v13, v14, v13 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v14, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v15, -v12, v14, v10 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v14, v15, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v10, -v12, v14, v10 +; GFX11-FAKE16-NEXT: s_denorm_mode 12 +; GFX11-FAKE16-NEXT: v_div_fmas_f32 v10, v10, v13, v14 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v9, v12, v10 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v9, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v9, -v9, v8, v12 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, v9, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v9, 11 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB10_13 +; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow129 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v12 +; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, -10, v11 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v10, v9, v10 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v10, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v9, -v10, v8, v9 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, v9, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_div_fixup_f16 v1, v1, v8, v6 -; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_fma_f16 v1, -v1, v8, v6 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v8, v8 -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v5, v5 +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX11-FAKE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v5 +; GFX11-FAKE16-NEXT: .LBB10_16: +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v10, |v1| +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v9, |v3| +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_18 +; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX11-FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, 0, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v1, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_19 +; GFX11-FAKE16-NEXT: s_branch .LBB10_24 +; GFX11-FAKE16-NEXT: .LBB10_18: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v10 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v12, v9 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v10, v8, 11 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v9 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v12, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_scale_f32 v13, null, v9, v9, 1.0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, -1, v8 +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v14, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_not_b32_e32 v12, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v11 +; GFX11-FAKE16-NEXT: v_div_scale_f32 v11, vcc_lo, 1.0, v9, 1.0 +; GFX11-FAKE16-NEXT: s_denorm_mode 15 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX11-FAKE16-NEXT: v_fma_f32 v15, -v13, v14, 1.0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v3, v6, v5 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v14, v15, v14 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v15, v11, v14 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX11-FAKE16-NEXT: v_fma_f32 v16, -v13, v15, v11 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v15, v16, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v11, -v13, v15, v11 +; GFX11-FAKE16-NEXT: s_denorm_mode 12 +; GFX11-FAKE16-NEXT: v_div_fmas_f32 v11, v11, v14, v15 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v10 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s2, 11 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v0 -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_fma_f16 v3, -v3, v2, v0 -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v10, v13, v11 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v10, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v10, -v10, v9, v13 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, v10, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v10, v10, 11 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB10_21 +; GFX11-FAKE16-NEXT: ; %bb.22: ; %Flow125 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v13 +; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, -10, v12 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v10, v10, v12 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v11, v10, v11 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v11, v11 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX11-FAKE16-NEXT: v_fma_f32 v10, -v11, v9, v10 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, v10, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_div_fixup_f16 v0, v0, v7, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v8, v9, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v7, v5 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX11-FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v1 +; GFX11-FAKE16-NEXT: .LBB10_24: +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v13, |v9| +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v12, |v10| ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_26 +; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX11-FAKE16-NEXT: v_bfi_b32 v11, 0x7fff, 0, v9 +; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v11, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_27 +; GFX11-FAKE16-NEXT: s_branch .LBB10_32 +; GFX11-FAKE16-NEXT: .LBB10_26: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v13 +; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v15, v12 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_ldexp_f32 v13, v11, 11 +; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v12 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v12, v15, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v14 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_scale_f32 v16, null, v12, v12, 1.0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, -1, v11 +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v17, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v15, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v14 +; GFX11-FAKE16-NEXT: v_div_scale_f32 v14, vcc_lo, 1.0, v12, 1.0 +; GFX11-FAKE16-NEXT: s_denorm_mode 15 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v17, v18, v17 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v18, v14, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v19, -v16, v18, v14 +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v18, v19, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v14, -v16, v18, v14 +; GFX11-FAKE16-NEXT: s_denorm_mode 12 +; GFX11-FAKE16-NEXT: v_div_fmas_f32 v14, v14, v17, v18 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 +; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v13 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -11 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s2, 11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v13, v16, v14 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v13, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v13, -v13, v12, v16 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, v13, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v13, v13, 11 +; GFX11-FAKE16-NEXT: s_cbranch_scc1 .LBB10_29 +; GFX11-FAKE16-NEXT: ; %bb.30: ; %Flow +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v16 +; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, -10, v15 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v13, 
v13, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v14, v13, v14 +; GFX11-FAKE16-NEXT: v_rndne_f32_e32 v14, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_f32 v13, -v14, v12, v13 +; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, v13, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_ldexp_f32 v11, v12, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX11-FAKE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v9 +; GFX11-FAKE16-NEXT: .LBB10_32: ; %Flow124 +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0| +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v5| +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1| +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v9| +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX1150-TRUE16-LABEL: frem_v4f16: ; GFX1150-TRUE16: ; %bb.0: ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, 0 +; GFX1150-TRUE16-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_b64 v[2:3], v6, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_b64 v[4:5], v6, s[4:5] offset:32 -; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1150-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.h -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1150-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[8:9] offset:32 +; GFX1150-TRUE16-NEXT: s_and_b32 s3, s5, 0x7fff +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s8, s3 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1150-TRUE16-NEXT: s_and_b32 s6, s4, 0x7fff +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s6, s6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 +; GFX1150-TRUE16-NEXT: 
s_cbranch_scc0 .LBB10_2 +; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s10 +; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_3 +; GFX1150-TRUE16-NEXT: s_branch .LBB10_8 +; GFX1150-TRUE16-NEXT: .LBB10_2: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1150-TRUE16-NEXT: s_denorm_mode 15 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1150-TRUE16-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1150-TRUE16-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1150-TRUE16-NEXT: s_denorm_mode 12 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-TRUE16-NEXT: s_sub_i32 s6, s8, s6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_add_i32 s6, s6, 11 +; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1150-TRUE16-NEXT: s_add_i32 s6, s6, -11 +; GFX1150-TRUE16-NEXT: s_cmp_gt_i32 s6, 11 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v1 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v2, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v1, v7, v1 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1150-TRUE16-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc1 .LBB10_5 +; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow133 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s6 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v8.l, v1.l -; 
GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v7.l, v0.l, v8.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s5 +; GFX1150-TRUE16-NEXT: .LBB10_8: +; GFX1150-TRUE16-NEXT: s_lshr_b32 s8, s5, 16 +; GFX1150-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1150-TRUE16-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX1150-TRUE16-NEXT: s_and_b32 s9, s6, 0x7fff +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s10, s5 +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 +; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, s8, v1.l, s12 +; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_11 +; GFX1150-TRUE16-NEXT: s_branch .LBB10_16 +; GFX1150-TRUE16-NEXT: .LBB10_10: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1150-TRUE16-NEXT: 
v_ldexp_f32 v3, v1, 11 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s9 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v4, v0, v2 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v8, v1 +; GFX1150-TRUE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1150-TRUE16-NEXT: s_denorm_mode 15 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v4, v0, v2 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v1, v8, v1 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1150-TRUE16-NEXT: v_fma_f32 v9, -v6, v8, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v5.h -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v8, 
v9, v7 +; GFX1150-TRUE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1150-TRUE16-NEXT: s_denorm_mode 12 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11 +; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, -11 +; GFX1150-TRUE16-NEXT: s_cmp_gt_i32 s9, 11 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v0.h -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-TRUE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v7.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 
v0, v3.h -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc1 .LBB10_13 +; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow129 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s9 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v4, -v5, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v4, v1 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v4, -v5, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 +; 
GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s8 +; GFX1150-TRUE16-NEXT: .LBB10_16: +; GFX1150-TRUE16-NEXT: s_and_b32 s8, s7, 0x7fff +; GFX1150-TRUE16-NEXT: s_and_b32 s9, s2, 0x7fff +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s10, s8 +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 +; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v2.l, s7, v2.l, s12 +; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_19 +; GFX1150-TRUE16-NEXT: s_branch .LBB10_24 +; GFX1150-TRUE16-NEXT: .LBB10_18: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s9 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1150-TRUE16-NEXT: s_denorm_mode 15 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1150-TRUE16-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1150-TRUE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1150-TRUE16-NEXT: s_denorm_mode 12 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v1.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v5.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11 +; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, -11 +; GFX1150-TRUE16-NEXT: s_cmp_gt_i32 s9, 11 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-TRUE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc1 .LBB10_21 +; GFX1150-TRUE16-NEXT: ; %bb.22: ; %Flow125 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, s9 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; 
GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v7.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, s7 +; GFX1150-TRUE16-NEXT: .LBB10_24: +; GFX1150-TRUE16-NEXT: s_lshr_b32 s10, s7, 16 +; GFX1150-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX1150-TRUE16-NEXT: s_and_b32 s7, s10, 0x7fff +; GFX1150-TRUE16-NEXT: s_and_b32 s11, s9, 0x7fff +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s12, s7 +; GFX1150-TRUE16-NEXT: s_cvt_f32_f16 s11, s11 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 +; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v3.l, s10, v3.l, s14 +; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_27 +; GFX1150-TRUE16-NEXT: s_branch .LBB10_32 +; GFX1150-TRUE16-NEXT: .LBB10_26: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11 +; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, 1 +; 
GFX1150-TRUE16-NEXT: v_ldexp_f32 v5, v3, 11 +; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s11 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s12, v6 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v9, v8 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_not_b32_e32 v7, v3 +; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1150-TRUE16-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1150-TRUE16-NEXT: s_denorm_mode 15 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1150-TRUE16-NEXT: v_fma_f32 v11, -v8, v10, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v0, v3 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v1 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX1150-TRUE16-NEXT: v_fma_f32 v6, -v8, v10, v6 +; GFX1150-TRUE16-NEXT: s_denorm_mode 12 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 +; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; 
GFX1150-TRUE16-NEXT: s_sub_i32 s11, s12, s11 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_add_i32 s11, s11, 11 +; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v8, v5 +; GFX1150-TRUE16-NEXT: s_add_i32 s11, s11, -11 +; GFX1150-TRUE16-NEXT: s_cmp_gt_i32 s11, 11 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v0, v3 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v8, v6 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-TRUE16-NEXT: v_fma_f32 v5, v5, v4, v8 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v0.h +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v5, v5, 11 +; GFX1150-TRUE16-NEXT: s_cbranch_scc1 .LBB10_29 +; GFX1150-TRUE16-NEXT: ; %bb.30: ; %Flow +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, s11 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v8 +; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1150-TRUE16-NEXT: v_rndne_f32_e32 v6, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v5.l, v3.l -; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v3, v0.l, v4.l -; GFX1150-TRUE16-NEXT: global_store_b64 v6, v[2:3], s[0:1] +; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, s10 +; GFX1150-TRUE16-NEXT: .LBB10_32: ; %Flow124 +; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s4, 0 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s3, 0x7c00 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_and_b32 s3, s3, s4 +; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s6, 0 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s3 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s5, 0x7c00 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1150-TRUE16-NEXT: s_cselect_b32 
s4, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_and_b32 s3, s4, s3 +; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s2, 0 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v1.l, s3 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s8, 0x7c00 +; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2 +; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s9, 0 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, s2 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s7, 0x7c00 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v3.l, s2 +; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX1150-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: frem_v4f16: ; GFX1150-FAKE16: ; %bb.0: ; GFX1150-FAKE16-NEXT: s_clause 0x1 ; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-FAKE16-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-FAKE16-NEXT: s_clause 0x1 -; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; GFX1150-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 -; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-FAKE16-NEXT: 
v_lshrrev_b32_e32 v5, 16, v0 +; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v8 -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[8:9] offset:32 +; GFX1150-FAKE16-NEXT: s_and_b32 s3, s5, 0x7fff +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s8, s3 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1150-FAKE16-NEXT: s_and_b32 s6, s4, 0x7fff +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s6, s6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6 +; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2 +; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s6 +; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 +; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo +; GFX1150-FAKE16-NEXT: s_cbranch_execz .LBB10_3 +; GFX1150-FAKE16-NEXT: s_branch .LBB10_8 +; GFX1150-FAKE16-NEXT: .LBB10_2: +; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0 +; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8 +; GFX1150-FAKE16-NEXT: 
v_frexp_exp_i32_f32_e32 v3, s8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1150-FAKE16-NEXT: s_denorm_mode 15 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1150-FAKE16-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1150-FAKE16-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1150-FAKE16-NEXT: s_denorm_mode 12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; 
GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-FAKE16-NEXT: s_sub_i32 s6, s8, s6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_add_i32 s6, s6, 11 +; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1150-FAKE16-NEXT: s_add_i32 s6, s6, -11 +; GFX1150-FAKE16-NEXT: s_cmp_gt_i32 s6, 11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v2, v2 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8 -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1150-FAKE16-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc1 .LBB10_5 +; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow133 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s6 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s5 +; GFX1150-FAKE16-NEXT: .LBB10_8: +; GFX1150-FAKE16-NEXT: s_lshr_b32 s8, s5, 16 +; GFX1150-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1150-FAKE16-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX1150-FAKE16-NEXT: s_and_b32 s9, s6, 0x7fff +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s10, s5 +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10 +; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 +; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX1150-FAKE16-NEXT: s_cbranch_execz .LBB10_11 +; 
GFX1150-FAKE16-NEXT: s_branch .LBB10_16 +; GFX1150-FAKE16-NEXT: .LBB10_10: +; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1 +; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v1, 11 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s9 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v6 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 -; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-FAKE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1150-FAKE16-NEXT: s_denorm_mode 15 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; 
GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1150-FAKE16-NEXT: v_fma_f32 v9, -v6, v8, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1150-FAKE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1150-FAKE16-NEXT: s_denorm_mode 12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11 +; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, -11 +; GFX1150-FAKE16-NEXT: s_cmp_gt_i32 s9, 11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-FAKE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1150-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc1 .LBB10_13 +; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow129 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s9 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0 -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1150-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0 -; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v1, v2, v1 +; 
GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s8 +; GFX1150-FAKE16-NEXT: .LBB10_16: +; GFX1150-FAKE16-NEXT: s_and_b32 s8, s7, 0x7fff +; GFX1150-FAKE16-NEXT: s_and_b32 s9, s2, 0x7fff +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s10, s8 +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18 +; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 +; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v2, s7, v2, vcc_lo +; GFX1150-FAKE16-NEXT: s_cbranch_execz .LBB10_19 +; GFX1150-FAKE16-NEXT: s_branch .LBB10_24 +; GFX1150-FAKE16-NEXT: .LBB10_18: +; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s9 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1150-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1150-FAKE16-NEXT: s_denorm_mode 15 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v8 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7 -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1150-FAKE16-NEXT: v_fma_f32 v10, -v7, v9, v5 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1150-FAKE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1150-FAKE16-NEXT: s_denorm_mode 12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9 +; GFX1150-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11 +; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, -11 +; GFX1150-FAKE16-NEXT: s_cmp_gt_i32 s9, 11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2 -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-FAKE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc1 .LBB10_21 +; GFX1150-FAKE16-NEXT: ; %bb.22: ; %Flow125 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, s9 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 +; 
GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, s7 +; GFX1150-FAKE16-NEXT: .LBB10_24: +; GFX1150-FAKE16-NEXT: s_lshr_b32 s10, s7, 16 +; GFX1150-FAKE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX1150-FAKE16-NEXT: s_and_b32 s7, s10, 0x7fff +; GFX1150-FAKE16-NEXT: s_and_b32 s11, s9, 0x7fff +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s12, s7 +; GFX1150-FAKE16-NEXT: s_cvt_f32_f16 s11, s11 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26 +; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s12, s11 +; GFX1150-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 +; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v3, s10, v3, vcc_lo +; GFX1150-FAKE16-NEXT: s_cbranch_execz .LBB10_27 +; GFX1150-FAKE16-NEXT: s_branch .LBB10_32 +; GFX1150-FAKE16-NEXT: .LBB10_26: +; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr3 +; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11 +; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, 1 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v5, v3, 11 +; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s11 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s12, v6 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v9, v8 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX1150-FAKE16-NEXT: v_not_b32_e32 v7, v3 +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1150-FAKE16-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1150-FAKE16-NEXT: s_denorm_mode 15 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1150-FAKE16-NEXT: v_fma_f32 v11, -v8, v10, v6 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v10, v11, v9 
+; GFX1150-FAKE16-NEXT: v_fma_f32 v6, -v8, v10, v6 +; GFX1150-FAKE16-NEXT: s_denorm_mode 12 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 +; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1150-FAKE16-NEXT: s_sub_i32 s11, s12, s11 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: s_add_i32 s11, s11, 11 +; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v8, v5 +; GFX1150-FAKE16-NEXT: s_add_i32 s11, s11, -11 +; GFX1150-FAKE16-NEXT: s_cmp_gt_i32 s11, 11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v8, v6 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-FAKE16-NEXT: v_fma_f32 v5, v5, v4, v8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v5, v5, 11 +; GFX1150-FAKE16-NEXT: s_cbranch_scc1 
.LBB10_29 +; GFX1150-FAKE16-NEXT: ; %bb.30: ; %Flow +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, s11 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v8 +; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1150-FAKE16-NEXT: v_rndne_f32_e32 v6, v6 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 -; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3 -; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, s10 +; GFX1150-FAKE16-NEXT: .LBB10_32: ; %Flow124 +; GFX1150-FAKE16-NEXT: s_cmp_lg_f16 s4, 0 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1150-FAKE16-NEXT: s_cmp_nge_f16 s3, 0x7c00 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s4 +; GFX1150-FAKE16-NEXT: s_cmp_lg_f16 s6, 0 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX1150-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-FAKE16-NEXT: s_cmp_nge_f16 s5, 0x7c00 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: s_and_b32 vcc_lo, s4, s3 +; GFX1150-FAKE16-NEXT: s_cmp_lg_f16 s2, 0 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v1, vcc_lo +; GFX1150-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-FAKE16-NEXT: s_cmp_nge_f16 s8, 0x7c00 +; GFX1150-FAKE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1150-FAKE16-NEXT: s_cmp_lg_f16 s9, 0 +; GFX1150-FAKE16-NEXT: v_dual_cndmask_b32 v1, 0x7e00, v2 :: v_dual_mov_b32 v2, 0 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-FAKE16-NEXT: s_cmp_nge_f16 s7, 0x7c00 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1150-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1150-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v3, vcc_lo +; GFX1150-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX1150-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1150-FAKE16-NEXT: s_endpgm ; ; GFX1200-TRUE16-LABEL: frem_v4f16: ; GFX1200-TRUE16: ; %bb.0: ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, 0 +; GFX1200-TRUE16-NEXT: s_load_b64 
s[8:9], s[4:5], 0x34 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_b64 v[2:3], v6, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_b64 v[4:5], v6, s[4:5] offset:32 -; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1200-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.h -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1200-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[8:9] offset:32 +; GFX1200-TRUE16-NEXT: s_and_b32 s3, s5, 0x7fff +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s8, s3 +; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1200-TRUE16-NEXT: s_and_b32 s6, s4, 0x7fff +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s6, s6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 +; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 +; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s10 +; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_3 +; GFX1200-TRUE16-NEXT: s_branch .LBB10_8 +; GFX1200-TRUE16-NEXT: .LBB10_2: +; GFX1200-TRUE16-NEXT: ; 
implicit-def: $vgpr0 +; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1200-TRUE16-NEXT: s_denorm_mode 15 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v6, v7, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v1 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1200-TRUE16-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v1, v7, v1 -; 
GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1200-TRUE16-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1200-TRUE16-NEXT: s_denorm_mode 12 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v8.l, v1.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-TRUE16-NEXT: s_sub_co_i32 s6, s8, s6 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s6, s6, 11 +; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v7.l, v0.l, v8.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s6, s6, -11 +; 
GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cmp_gt_i32 s6, 11 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v2, v5, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v4, v0, v2 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v8, v1 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v2, v2 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v4, v0, v2 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v1, v8, v1 +; GFX1200-TRUE16-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v5.h -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc1 .LBB10_5 +; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow133 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s6 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s5 +; GFX1200-TRUE16-NEXT: .LBB10_8: +; GFX1200-TRUE16-NEXT: s_lshr_b32 s8, s5, 16 +; GFX1200-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX1200-TRUE16-NEXT: s_and_b32 s9, s6, 0x7fff +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s10, s5 +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 +; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, s8, v1.l, s12 +; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_11 +; GFX1200-TRUE16-NEXT: s_branch .LBB10_16 +; GFX1200-TRUE16-NEXT: .LBB10_10: +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; 
GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v1, 11 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s9 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-TRUE16-NEXT: s_denorm_mode 15 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v7, v8, v7 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v0.h -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1200-TRUE16-NEXT: v_fma_f32 v9, -v6, v8, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1200-TRUE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1200-TRUE16-NEXT: 
s_denorm_mode 12 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11 +; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v7.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.h -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, -11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cmp_gt_i32 s9, 11 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v4, -v5, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v4, v1 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v4, -v5, v0, v3 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX1200-TRUE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1200-TRUE16-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc1 .LBB10_13 +; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow129 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s9 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX1200-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s8 +; GFX1200-TRUE16-NEXT: .LBB10_16: +; GFX1200-TRUE16-NEXT: s_and_b32 s8, s7, 0x7fff +; GFX1200-TRUE16-NEXT: s_and_b32 s9, s2, 0x7fff +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s10, s8 +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 +; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v2.l, s7, v2.l, s12 +; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_19 +; GFX1200-TRUE16-NEXT: s_branch .LBB10_24 +; GFX1200-TRUE16-NEXT: .LBB10_18: +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s9 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX1200-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1200-TRUE16-NEXT: s_denorm_mode 15 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1200-TRUE16-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1200-TRUE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1200-TRUE16-NEXT: s_denorm_mode 12 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v1.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v5.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11 +; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, -11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: 
s_cmp_gt_i32 s9, 11 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc1 .LBB10_21 +; GFX1200-TRUE16-NEXT: ; %bb.22: ; %Flow125 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, s9 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; 
GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v7.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, s7 +; GFX1200-TRUE16-NEXT: .LBB10_24: +; GFX1200-TRUE16-NEXT: s_lshr_b32 s10, s7, 16 +; GFX1200-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s7, s10, 0x7fff +; GFX1200-TRUE16-NEXT: s_and_b32 s11, s9, 0x7fff +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s12, s7 +; GFX1200-TRUE16-NEXT: s_cvt_f32_f16 s11, s11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 +; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v3.l, s10, v3.l, s14 +; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_27 +; GFX1200-TRUE16-NEXT: s_branch .LBB10_32 +; GFX1200-TRUE16-NEXT: .LBB10_26: +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11 +; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, 
v4, 1 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v5, v3, 11 +; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s11 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s12, v6 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v9, v8 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_not_b32_e32 v7, v3 +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1200-TRUE16-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1200-TRUE16-NEXT: s_denorm_mode 15 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v9, v10, v9 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v0, v3 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v1 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1200-TRUE16-NEXT: v_fma_f32 v11, -v8, v10, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v0, v3 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX1200-TRUE16-NEXT: v_fma_f32 v6, -v8, v10, v6 +; GFX1200-TRUE16-NEXT: s_denorm_mode 12 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 +; 
GFX1200-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1200-TRUE16-NEXT: s_sub_co_i32 s11, s12, s11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s11, s11, 11 +; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v8, v5 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_add_co_i32 s11, s11, -11 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_cmp_gt_i32 s11, 11 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v5, v8, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v0.h +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_fma_f32 v5, v5, v4, v8 +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v5.l, v3.l -; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v3, v0.l, v4.l -; GFX1200-TRUE16-NEXT: global_store_b64 v6, v[2:3], s[0:1] +; 
GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v5, v5, 11 +; GFX1200-TRUE16-NEXT: s_cbranch_scc1 .LBB10_29 +; GFX1200-TRUE16-NEXT: ; %bb.30: ; %Flow +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, s11 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v8 +; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1200-TRUE16-NEXT: v_rndne_f32_e32 v6, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, s10 +; GFX1200-TRUE16-NEXT: .LBB10_32: ; %Flow124 +; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s4, 0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s3, 0x7c00 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s3, s3, s4 +; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s6, 0 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s3 +; GFX1200-TRUE16-NEXT: s_cselect_b32 
s3, -1, 0 +; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s5, 0x7c00 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s3, s4, s3 +; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s2, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v1.l, s3 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s8, 0x7c00 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2 +; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s9, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, s2 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s7, 0x7c00 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2 +; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v3.l, s2 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX1200-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: frem_v4f16: ; GFX1200-FAKE16: ; %bb.0: ; GFX1200-FAKE16-NEXT: s_clause 0x1 ; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-FAKE16-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-FAKE16-NEXT: s_clause 0x1 -; GFX1200-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; GFX1200-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 -; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX1200-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v8, v8 -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1200-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[8:9] offset:32 +; GFX1200-FAKE16-NEXT: s_and_b32 s3, s5, 0x7fff +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s8, s3 +; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1200-FAKE16-NEXT: s_and_b32 s6, s4, 0x7fff +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s6, s6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6 +; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2 +; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s6 +; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 +; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo +; GFX1200-FAKE16-NEXT: s_cbranch_execz .LBB10_3 +; GFX1200-FAKE16-NEXT: s_branch .LBB10_8 +; 
GFX1200-FAKE16-NEXT: .LBB10_2: +; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0 +; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v0, 11 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v5 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX1200-FAKE16-NEXT: v_not_b32_e32 v4, v0 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1200-FAKE16-NEXT: s_denorm_mode 15 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1200-FAKE16-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1200-FAKE16-NEXT: v_fma_f32 v3, -v5, 
v7, v3 +; GFX1200-FAKE16-NEXT: s_denorm_mode 12 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 +; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 +; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-FAKE16-NEXT: s_sub_co_i32 s6, s8, s6 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s6, s6, 11 +; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s6, s6, -11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cmp_gt_i32 s6, 11 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v2, v5, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v2, v2 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8 -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX1200-FAKE16-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5 +; GFX1200-FAKE16-NEXT: 
v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, 11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc1 .LBB10_5 +; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow133 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s6 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5 +; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s5 +; GFX1200-FAKE16-NEXT: .LBB10_8: +; GFX1200-FAKE16-NEXT: s_lshr_b32 s8, s5, 16 +; GFX1200-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX1200-FAKE16-NEXT: s_and_b32 s9, s6, 0x7fff +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s10, s5 +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_2) +; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10 +; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 +; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX1200-FAKE16-NEXT: s_cbranch_execz .LBB10_11 +; GFX1200-FAKE16-NEXT: s_branch .LBB10_16 +; GFX1200-FAKE16-NEXT: .LBB10_10: +; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1 +; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v1, 11 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v1, s9 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v6 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 -; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0 
-; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1200-FAKE16-NEXT: v_not_b32_e32 v5, v1 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-FAKE16-NEXT: s_denorm_mode 15 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1200-FAKE16-NEXT: v_fma_f32 v9, -v6, v8, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1200-FAKE16-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1200-FAKE16-NEXT: s_denorm_mode 12 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 +; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 +; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11 +; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, -11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cmp_gt_i32 s9, 11 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v6, v4 +; 
GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, 11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc1 .LBB10_13 +; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow129 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s9 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6 +; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0 -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1200-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0 -; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s8 +; GFX1200-FAKE16-NEXT: .LBB10_16: +; GFX1200-FAKE16-NEXT: s_and_b32 s8, s7, 0x7fff +; GFX1200-FAKE16-NEXT: s_and_b32 s9, s2, 0x7fff +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s10, s8 +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s9, s9 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 +; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18 +; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 +; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 +; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v2, s7, v2, vcc_lo +; GFX1200-FAKE16-NEXT: s_cbranch_execz .LBB10_19 +; GFX1200-FAKE16-NEXT: s_branch .LBB10_24 +; GFX1200-FAKE16-NEXT: .LBB10_18: +; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 ; GFX1200-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v2, 11 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v2, s9 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v8, v7 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_not_b32_e32 v6, v2 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1200-FAKE16-NEXT: s_denorm_mode 15 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1200-FAKE16-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1200-FAKE16-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1200-FAKE16-NEXT: 
s_denorm_mode 12 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 +; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 +; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11 +; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, -11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cmp_gt_i32 s9, 11 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, 11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc1 .LBB10_21 +; GFX1200-FAKE16-NEXT: ; %bb.22: ; %Flow125 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, s9 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, v7 +; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1200-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7 -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, s7 +; GFX1200-FAKE16-NEXT: .LBB10_24: +; GFX1200-FAKE16-NEXT: s_lshr_b32 s10, s7, 16 +; GFX1200-FAKE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_and_b32 s7, s10, 0x7fff +; GFX1200-FAKE16-NEXT: s_and_b32 s11, s9, 0x7fff +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s12, s7 +; GFX1200-FAKE16-NEXT: s_cvt_f32_f16 s11, s11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26 
+; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s12, s11 +; GFX1200-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 +; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v3, s10, v3, vcc_lo +; GFX1200-FAKE16-NEXT: s_cbranch_execz .LBB10_27 +; GFX1200-FAKE16-NEXT: s_branch .LBB10_32 +; GFX1200-FAKE16-NEXT: .LBB10_26: +; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr3 +; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11 +; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, 1 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v5, v3, 11 +; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s11 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s12, v6 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-FAKE16-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v9, v8 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1200-FAKE16-NEXT: v_not_b32_e32 v7, v3 +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1200-FAKE16-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1200-FAKE16-NEXT: s_denorm_mode 15 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; 
GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v9, v10, v9 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2 -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1200-FAKE16-NEXT: v_fma_f32 v11, -v8, v10, v6 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX1200-FAKE16-NEXT: v_fma_f32 v6, -v8, v10, v6 +; GFX1200-FAKE16-NEXT: s_denorm_mode 12 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-FAKE16-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 +; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 +; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1200-FAKE16-NEXT: s_sub_co_i32 s11, s12, s11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s11, s11, 11 +; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v8, v5 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_add_co_i32 s11, s11, -11 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_cmp_gt_i32 s11, 11 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v8, v6 ; 
GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1200-FAKE16-NEXT: v_fma_f32 v5, v5, v4, v8 +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v5, v5, 11 +; GFX1200-FAKE16-NEXT: s_cbranch_scc1 .LBB10_29 +; GFX1200-FAKE16-NEXT: ; %bb.30: ; %Flow +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, s11 +; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v8 +; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1200-FAKE16-NEXT: v_rndne_f32_e32 v6, v6 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 -; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 
0x80000000, v6 +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3 -; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1200-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, s10 +; GFX1200-FAKE16-NEXT: .LBB10_32: ; %Flow124 +; GFX1200-FAKE16-NEXT: s_cmp_lg_f16 s4, 0 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1200-FAKE16-NEXT: s_cmp_nge_f16 s3, 0x7c00 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s4 +; GFX1200-FAKE16-NEXT: s_cmp_lg_f16 s6, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX1200-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-FAKE16-NEXT: s_cmp_nge_f16 s5, 0x7c00 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: s_and_b32 vcc_lo, s4, s3 +; GFX1200-FAKE16-NEXT: s_cmp_lg_f16 s2, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v1, vcc_lo +; GFX1200-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-FAKE16-NEXT: s_cmp_nge_f16 s8, 0x7c00 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX1200-FAKE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1200-FAKE16-NEXT: s_cmp_lg_f16 s9, 0 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: v_dual_cndmask_b32 v1, 0x7e00, v2 :: v_dual_mov_b32 v2, 0 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-FAKE16-NEXT: s_cmp_nge_f16 s7, 0x7c00 +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1200-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-FAKE16-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX1200-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v3, vcc_lo +; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX1200-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 @@ -4082,103 +11620,357 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; 
SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; SI-NEXT: v_rcp_f32_e32 v6, v5 +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB11_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v4, s2, 0, v0 +; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| +; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB11_3 +; SI-NEXT: s_branch .LBB11_8 +; SI-NEXT: .LBB11_2: +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB11_3: ; %frem.compute +; SI-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_cselect_b32 s4, s4, 0 +; SI-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; SI-NEXT: v_cndmask_b32_e64 v4, |v0|, v4, s[2:3] +; SI-NEXT: v_ldexp_f32_e64 v5, v4, 12 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 +; SI-NEXT: v_frexp_mant_f32_e64 v4, |v2| +; SI-NEXT: v_cndmask_b32_e64 v4, |v2|, v4, s[2:3] +; SI-NEXT: v_frexp_exp_i32_f32_e32 v6, v2 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s2, v6 +; SI-NEXT: s_cselect_b32 s5, s2, 0 +; SI-NEXT: s_add_i32 s2, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v4, v4, 1 +; SI-NEXT: s_not_b32 s3, s2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v4, 1.0 +; SI-NEXT: v_div_scale_f32 v7, s[6:7], v4, v4, 1.0 +; SI-NEXT: v_rcp_f32_e32 v8, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; SI-NEXT: v_fma_f32 v6, v7, v6, v6 -; SI-NEXT: v_mul_f32_e32 v7, v4, v6 -; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; SI-NEXT: v_fma_f32 v7, v8, v6, v7 -; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; 
SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; SI-NEXT: v_fma_f32 v8, v9, v8, v8 +; SI-NEXT: v_mul_f32_e32 v9, v6, v8 +; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 +; SI-NEXT: v_fma_f32 v9, v10, v8, v9 +; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 -; SI-NEXT: v_trunc_f32_e32 v4, v4 -; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 -; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 +; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; SI-NEXT: s_cmp_lt_i32 s3, 13 +; SI-NEXT: s_cbranch_scc1 .LBB11_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s3, s4, s5 +; SI-NEXT: s_add_i32 s3, s3, 12 +; SI-NEXT: .LBB11_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_mul_f32_e32 v5, v7, v6 +; SI-NEXT: v_rndne_f32_e32 v5, v5 +; SI-NEXT: v_fma_f32 v5, -v5, v4, v7 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; SI-NEXT: v_add_f32_e32 v8, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; SI-NEXT: v_ldexp_f32_e64 v5, v5, 12 +; SI-NEXT: s_add_i32 s3, s3, -12 +; SI-NEXT: s_cmp_gt_i32 s3, 12 +; SI-NEXT: s_cbranch_scc1 .LBB11_5 +; SI-NEXT: ; %bb.6: ; %Flow51 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: .LBB11_7: ; %frem.loop_exit +; SI-NEXT: s_add_i32 s3, s3, -11 +; SI-NEXT: v_ldexp_f32_e64 v5, v5, s3 +; SI-NEXT: v_mul_f32_e32 v6, v5, v6 +; SI-NEXT: v_rndne_f32_e32 v6, v6 +; SI-NEXT: v_fma_f32 v5, -v6, v4, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; SI-NEXT: v_add_f32_e32 v4, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; SI-NEXT: v_ldexp_f32_e64 v4, v4, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v4, s2, v4, v0 +; SI-NEXT: .LBB11_8: +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz 
.LBB11_10 +; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v5, s2, 0, v1 +; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| +; SI-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB11_11 +; SI-NEXT: s_branch .LBB11_16 +; SI-NEXT: .LBB11_10: +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB11_11: ; %frem.compute15 +; SI-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_cselect_b32 s4, s4, 0 +; SI-NEXT: v_frexp_mant_f32_e64 v5, |v1| +; SI-NEXT: v_cndmask_b32_e64 v5, |v1|, v5, s[2:3] +; SI-NEXT: v_ldexp_f32_e64 v6, v5, 12 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 +; SI-NEXT: v_frexp_mant_f32_e64 v5, |v3| +; SI-NEXT: v_cndmask_b32_e64 v5, |v3|, v5, s[2:3] +; SI-NEXT: v_frexp_exp_i32_f32_e32 v7, v3 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s2, v7 +; SI-NEXT: s_cselect_b32 s5, s2, 0 +; SI-NEXT: s_add_i32 s2, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v5, v5, 1 +; SI-NEXT: s_not_b32 s3, s2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v5, 1.0 +; SI-NEXT: v_div_scale_f32 v8, s[6:7], v5, v5, 1.0 +; SI-NEXT: v_rcp_f32_e32 v9, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v5, v6, v5, v5 -; SI-NEXT: v_mul_f32_e32 v6, v3, v5 -; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 -; SI-NEXT: v_fma_f32 v6, v7, v5, v6 -; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 +; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; SI-NEXT: v_fma_f32 v9, v10, v9, v9 +; SI-NEXT: v_mul_f32_e32 v10, v7, v9 +; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 +; SI-NEXT: v_fma_f32 v10, v11, v9, v10 +; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; 
SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 -; SI-NEXT: v_trunc_f32_e32 v3, v3 -; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 +; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 +; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; SI-NEXT: s_cmp_lt_i32 s3, 13 +; SI-NEXT: s_cbranch_scc1 .LBB11_15 +; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: s_sub_i32 s3, s4, s5 +; SI-NEXT: s_add_i32 s3, s3, 12 +; SI-NEXT: .LBB11_13: ; %frem.loop_body23 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_mul_f32_e32 v6, v8, v7 +; SI-NEXT: v_rndne_f32_e32 v6, v6 +; SI-NEXT: v_fma_f32 v6, -v6, v5, v8 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v9, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, 12 +; SI-NEXT: s_add_i32 s3, s3, -12 +; SI-NEXT: s_cmp_gt_i32 s3, 12 +; SI-NEXT: s_cbranch_scc1 .LBB11_13 +; SI-NEXT: ; %bb.14: ; %Flow +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; SI-NEXT: s_add_i32 s3, s3, -11 +; SI-NEXT: v_ldexp_f32_e64 v6, v6, s3 +; SI-NEXT: v_mul_f32_e32 v7, v6, v7 +; SI-NEXT: v_rndne_f32_e32 v7, v7 +; SI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v5, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_ldexp_f32_e64 v5, v5, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; SI-NEXT: .LBB11_16: ; %Flow50 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; SI-NEXT: s_and_b64 vcc, s[2:3], vcc +; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 +; SI-NEXT: v_cmp_nge_f32_e64 s[4:5], |v1|, s4 +; SI-NEXT: s_and_b64 vcc, s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; 
CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s6, s2 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s6, s10 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s8 -; CI-NEXT: s_mov_b32 s1, s9 -; CI-NEXT: s_mov_b32 s8, s10 -; CI-NEXT: s_mov_b32 s9, s11 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 -; CI-NEXT: v_rcp_f32_e32 v6, v5 +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB11_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v4, s2, 0, v0 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| +; CI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; CI-NEXT: s_cbranch_execz .LBB11_3 +; CI-NEXT: s_branch .LBB11_8 +; CI-NEXT: .LBB11_2: +; CI-NEXT: ; implicit-def: $vgpr4 +; CI-NEXT: .LBB11_3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v5, |v2| +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 1 +; CI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v2 +; CI-NEXT: v_ldexp_f32_e64 v7, v4, 12 +; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v10 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v0 +; CI-NEXT: v_not_b32_e32 v6, v4 +; CI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 +; CI-NEXT: v_rcp_f32_e32 v12, v11 ; CI-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v4, v6 -; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; CI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; CI-NEXT: v_fma_f32 v12, v13, v12, v12 +; CI-NEXT: v_mul_f32_e32 v13, v8, v12 +; CI-NEXT: v_fma_f32 v14, -v11, v13, v8 +; CI-NEXT: v_fma_f32 v13, v14, v12, v13 +; CI-NEXT: v_fma_f32 v8, -v11, v13, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 -; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; CI-NEXT: v_rcp_f32_e32 v5, v4 +; CI-NEXT: v_div_fmas_f32 v8, v8, v12, v13 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 +; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB11_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 +; CI-NEXT: v_add_i32_e32 v6, vcc, 12, v6 +; CI-NEXT: .LBB11_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mul_f32_e32 v7, v9, v8 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v7, -v7, v5, v9 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v10, v7, v5 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; CI-NEXT: v_add_i32_e32 v6, vcc, -12, v6 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v6 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 12 +; CI-NEXT: s_cbranch_vccnz .LBB11_5 +; CI-NEXT: ; %bb.6: ; %Flow51 +; CI-NEXT: v_mov_b32_e32 v7, v9 +; CI-NEXT: .LBB11_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6 +; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 +; CI-NEXT: v_mul_f32_e32 v7, v6, v8 +; CI-NEXT: v_rndne_f32_e32 v7, v7 +; CI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 
+; CI-NEXT: v_add_f32_e32 v5, v6, v5 +; CI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v4, s2, v4, v0 +; CI-NEXT: .LBB11_8: +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB11_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v5, s2, 0, v1 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| +; CI-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; CI-NEXT: s_cbranch_execz .LBB11_11 +; CI-NEXT: s_branch .LBB11_16 +; CI-NEXT: .LBB11_10: +; CI-NEXT: ; implicit-def: $vgpr5 +; CI-NEXT: .LBB11_11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f32_e64 v6, |v3| +; CI-NEXT: v_ldexp_f32_e64 v6, v6, 1 +; CI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v5, |v1| +; CI-NEXT: v_frexp_exp_i32_f32_e32 v11, v3 +; CI-NEXT: v_ldexp_f32_e64 v8, v5, 12 +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v11 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v1 +; CI-NEXT: v_not_b32_e32 v7, v5 +; CI-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; CI-NEXT: v_rcp_f32_e32 v13, v12 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v3, v5 -; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 +; CI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; CI-NEXT: v_fma_f32 v13, v14, v13, v13 +; CI-NEXT: v_mul_f32_e32 v14, v9, v13 +; CI-NEXT: v_fma_f32 v15, -v12, v14, v9 +; CI-NEXT: v_fma_f32 v14, v15, v13, v14 +; CI-NEXT: v_fma_f32 v9, -v12, v14, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 -; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 +; CI-NEXT: v_div_fmas_f32 v9, v9, v13, v14 +; 
CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 +; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB11_15 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 +; CI-NEXT: v_add_i32_e32 v7, vcc, 12, v7 +; CI-NEXT: .LBB11_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v10, v8 +; CI-NEXT: v_mul_f32_e32 v8, v10, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v8, -v8, v6, v10 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; CI-NEXT: v_add_f32_e32 v11, v8, v6 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; CI-NEXT: v_add_i32_e32 v7, vcc, -12, v7 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v7 +; CI-NEXT: v_ldexp_f32_e64 v8, v8, 12 +; CI-NEXT: s_cbranch_vccnz .LBB11_13 +; CI-NEXT: ; %bb.14: ; %Flow +; CI-NEXT: v_mov_b32_e32 v8, v10 +; CI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v7, vcc, -11, v7 +; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 +; CI-NEXT: v_mul_f32_e32 v8, v7, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v6, v7, v6 +; CI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; CI-NEXT: v_ldexp_f32_e32 v5, v6, v5 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; CI-NEXT: .LBB11_16: ; %Flow50 +; CI-NEXT: s_mov_b32 s4, 0x7f800000 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; CI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; CI-NEXT: s_and_b64 vcc, s[2:3], vcc +; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 +; CI-NEXT: v_cmp_nge_f32_e64 s[4:5], |v1|, s4 +; CI-NEXT: s_and_b64 vcc, s[4:5], vcc +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -4187,48 +11979,164 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_add_u32 s2, s4, 32 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 -; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 -; VI-NEXT: v_rcp_f32_e32 v8, v7 +; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v4, s2, 0, v0 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| +; VI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; VI-NEXT: s_cbranch_execz .LBB11_3 +; VI-NEXT: s_branch .LBB11_8 +; VI-NEXT: .LBB11_2: +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: .LBB11_3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v5, |v2| +; VI-NEXT: v_ldexp_f32 v5, v5, 1 +; VI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v2 +; VI-NEXT: v_ldexp_f32 v7, v4, 12 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v10 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v0 +; VI-NEXT: v_not_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 +; VI-NEXT: v_rcp_f32_e32 v12, v11 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; VI-NEXT: v_fma_f32 v8, v9, v8, v8 -; VI-NEXT: v_mul_f32_e32 v9, v6, v8 -; VI-NEXT: v_fma_f32 
v10, -v7, v9, v6 -; VI-NEXT: v_fma_f32 v9, v10, v8, v9 -; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; VI-NEXT: v_fma_f32 v12, v13, v12, v12 +; VI-NEXT: v_mul_f32_e32 v13, v8, v12 +; VI-NEXT: v_fma_f32 v14, -v11, v13, v8 +; VI-NEXT: v_fma_f32 v13, v14, v12, v13 +; VI-NEXT: v_fma_f32 v8, -v11, v13, v8 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 -; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 -; VI-NEXT: v_trunc_f32_e32 v6, v6 -; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 -; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 -; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 -; VI-NEXT: v_rcp_f32_e32 v7, v6 +; VI-NEXT: v_div_fmas_f32 v8, v8, v12, v13 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 +; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB11_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, 12, v6 +; VI-NEXT: .LBB11_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mul_f32_e32 v7, v9, v8 +; VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v7, -v7, v5, v9 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v10, v7, v5 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, -12, v6 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v6 +; VI-NEXT: v_ldexp_f32 v7, v7, 12 +; VI-NEXT: s_cbranch_vccnz .LBB11_5 +; VI-NEXT: ; %bb.6: ; %Flow51 +; VI-NEXT: v_mov_b32_e32 v7, v9 +; VI-NEXT: .LBB11_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v6, vcc, -11, v6 +; VI-NEXT: v_ldexp_f32 v6, v7, v6 +; VI-NEXT: v_mul_f32_e32 v7, v6, v8 +; VI-NEXT: v_rndne_f32_e32 v7, v7 +; VI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; VI-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; VI-NEXT: v_ldexp_f32 v4, v5, v4 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 
v4, s2, v4, v0 +; VI-NEXT: .LBB11_8: +; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB11_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v5, s2, 0, v1 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| +; VI-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; VI-NEXT: s_cbranch_execz .LBB11_11 +; VI-NEXT: s_branch .LBB11_16 +; VI-NEXT: .LBB11_10: +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: .LBB11_11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f32_e64 v6, |v3| +; VI-NEXT: v_ldexp_f32 v6, v6, 1 +; VI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v5, |v1| +; VI-NEXT: v_frexp_exp_i32_f32_e32 v11, v3 +; VI-NEXT: v_ldexp_f32 v8, v5, 12 +; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v11 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v1 +; VI-NEXT: v_not_b32_e32 v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v10 +; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; VI-NEXT: v_rcp_f32_e32 v13, v12 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; VI-NEXT: v_fma_f32 v7, v8, v7, v7 -; VI-NEXT: v_mul_f32_e32 v8, v5, v7 -; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 -; VI-NEXT: v_fma_f32 v8, v9, v7, v8 -; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 +; VI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; VI-NEXT: v_fma_f32 v13, v14, v13, v13 +; VI-NEXT: v_mul_f32_e32 v14, v9, v13 +; VI-NEXT: v_fma_f32 v15, -v12, v14, v9 +; VI-NEXT: v_fma_f32 v14, v15, v13, v14 +; VI-NEXT: v_fma_f32 v9, -v12, v14, v9 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 -; VI-NEXT: v_trunc_f32_e32 v5, v5 -; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_div_fmas_f32 v9, v9, v13, v14 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 +; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB11_15 +; VI-NEXT: ; 
%bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11 +; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v7 +; VI-NEXT: .LBB11_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_mul_f32_e32 v8, v10, v9 +; VI-NEXT: v_rndne_f32_e32 v8, v8 +; VI-NEXT: v_fma_f32 v8, -v8, v6, v10 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; VI-NEXT: v_add_f32_e32 v11, v8, v6 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, -12, v7 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v7 +; VI-NEXT: v_ldexp_f32 v8, v8, 12 +; VI-NEXT: s_cbranch_vccnz .LBB11_13 +; VI-NEXT: ; %bb.14: ; %Flow +; VI-NEXT: v_mov_b32_e32 v8, v10 +; VI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v7, vcc, -11, v7 +; VI-NEXT: v_ldexp_f32 v7, v8, v7 +; VI-NEXT: v_mul_f32_e32 v8, v7, v9 +; VI-NEXT: v_rndne_f32_e32 v8, v8 +; VI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; VI-NEXT: v_add_f32_e32 v6, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; VI-NEXT: v_ldexp_f32 v5, v6, v5 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; VI-NEXT: .LBB11_16: ; %Flow50 +; VI-NEXT: s_mov_b32 s4, 0x7f800000 +; VI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; VI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 +; VI-NEXT: v_cmp_nge_f32_e64 s[0:1], |v1|, s4 +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: frem_v2f32: @@ -4240,36 +12148,153 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 
; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v6, s[2:3], v3, v3, v1 -; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX9-NEXT: v_rcp_f32_e32 v7, v6 +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v0 +; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: s_branch .LBB11_8 +; GFX9-NEXT: .LBB11_2: +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: .LBB11_3: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f32_e64 v5, |v2| +; GFX9-NEXT: v_ldexp_f32 v5, v5, 1 +; GFX9-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 +; GFX9-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 +; GFX9-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v0 +; GFX9-NEXT: v_ldexp_f32 v7, v4, 12 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v10, v2 +; GFX9-NEXT: v_add_u32_e32 v4, -1, v10 +; GFX9-NEXT: v_not_b32_e32 v6, v4 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v9 +; GFX9-NEXT: v_rcp_f32_e32 v12, v11 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 -; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 -; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX9-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; GFX9-NEXT: v_fma_f32 v12, v13, v12, v12 +; GFX9-NEXT: v_mul_f32_e32 v13, v8, v12 +; GFX9-NEXT: v_fma_f32 v14, -v11, v13, v8 +; GFX9-NEXT: v_fma_f32 v13, v14, v12, v13 +; GFX9-NEXT: v_fma_f32 v8, -v11, v13, v8 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX9-NEXT: v_div_scale_f32 v5, 
s[2:3], v2, v2, v0 -; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; GFX9-NEXT: v_rcp_f32_e32 v6, v5 +; GFX9-NEXT: v_div_fmas_f32 v8, v8, v12, v13 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 +; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB11_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 +; GFX9-NEXT: v_add_u32_e32 v6, 12, v6 +; GFX9-NEXT: .LBB11_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 +; GFX9-NEXT: v_rndne_f32_e32 v7, v7 +; GFX9-NEXT: v_fma_f32 v7, -v7, v5, v9 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; GFX9-NEXT: v_add_f32_e32 v10, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v6, -12, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v6 +; GFX9-NEXT: v_ldexp_f32 v7, v7, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB11_5 +; GFX9-NEXT: ; %bb.6: ; %Flow51 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX9-NEXT: v_add_u32_e32 v6, -11, v6 +; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 +; GFX9-NEXT: v_rndne_f32_e32 v7, v7 +; GFX9-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v0 +; GFX9-NEXT: .LBB11_8: +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB11_10 +; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v5, s2, 0, v1 +; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GFX9-NEXT: s_cbranch_execz .LBB11_11 +; GFX9-NEXT: s_branch .LBB11_16 +; GFX9-NEXT: .LBB11_10: +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: .LBB11_11: 
; %frem.compute15 +; GFX9-NEXT: v_frexp_mant_f32_e64 v6, |v3| +; GFX9-NEXT: v_ldexp_f32 v6, v6, 1 +; GFX9-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 +; GFX9-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; GFX9-NEXT: v_frexp_mant_f32_e64 v5, |v1| +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v10, v1 +; GFX9-NEXT: v_ldexp_f32 v8, v5, 12 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v11, v3 +; GFX9-NEXT: v_add_u32_e32 v5, -1, v11 +; GFX9-NEXT: v_not_b32_e32 v7, v5 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v10 +; GFX9-NEXT: v_rcp_f32_e32 v13, v12 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 -; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX9-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; GFX9-NEXT: v_fma_f32 v13, v14, v13, v13 +; GFX9-NEXT: v_mul_f32_e32 v14, v9, v13 +; GFX9-NEXT: v_fma_f32 v15, -v12, v14, v9 +; GFX9-NEXT: v_fma_f32 v14, v15, v13, v14 +; GFX9-NEXT: v_fma_f32 v9, -v12, v14, v9 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 -; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 +; GFX9-NEXT: v_div_fmas_f32 v9, v9, v13, v14 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 +; GFX9-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB11_15 +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: v_sub_u32_e32 v7, v10, v11 +; GFX9-NEXT: v_add_u32_e32 v7, 12, v7 +; GFX9-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, v10, v9 +; GFX9-NEXT: v_rndne_f32_e32 v8, v8 +; GFX9-NEXT: v_fma_f32 v8, -v8, v6, v10 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; GFX9-NEXT: v_add_f32_e32 v11, v8, v6 +; GFX9-NEXT: v_add_u32_e32 v7, -12, v7 +; 
GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v7 +; GFX9-NEXT: v_ldexp_f32 v8, v8, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB11_13 +; GFX9-NEXT: ; %bb.14: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX9-NEXT: v_add_u32_e32 v7, -11, v7 +; GFX9-NEXT: v_ldexp_f32 v7, v8, v7 +; GFX9-NEXT: v_mul_f32_e32 v8, v7, v9 +; GFX9-NEXT: v_rndne_f32_e32 v8, v8 +; GFX9-NEXT: v_fma_f32 v7, -v8, v6, v7 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GFX9-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1 +; GFX9-NEXT: .LBB11_16: ; %Flow50 +; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], |v1|, s4 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -4284,37 +12309,154 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v6, s2, v3, v3, v1 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB11_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0 +; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2| +; GFX10-NEXT: v_cndmask_b32_e32 v4, 
v0, v4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB11_3 +; GFX10-NEXT: s_branch .LBB11_8 +; GFX10-NEXT: .LBB11_2: +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: .LBB11_3: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v2| +; GFX10-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; GFX10-NEXT: v_ldexp_f32 v5, v5, 1 +; GFX10-NEXT: v_ldexp_f32 v6, v4, 12 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v4, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v7 +; GFX10-NEXT: v_div_scale_f32 v9, s4, v5, v5, 1.0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; GFX10-NEXT: v_rcp_f32_e32 v10, v9 +; GFX10-NEXT: v_not_b32_e32 v8, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7 -; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 -; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7 -; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX10-NEXT: v_div_scale_f32 v5, s2, v2, v2, v0 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 -; GFX10-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v8 +; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB11_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 12 +; GFX10-NEXT: 
.LBB11_5: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-NEXT: s_add_i32 s2, s2, -12 +; GFX10-NEXT: s_cmp_gt_i32 s2, 12 +; GFX10-NEXT: v_mul_f32_e32 v6, v9, v7 +; GFX10-NEXT: v_rndne_f32_e32 v6, v6 +; GFX10-NEXT: v_fma_f32 v6, -v6, v5, v9 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_add_f32_e32 v8, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v6, v6, 12 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_5 +; GFX10-NEXT: ; %bb.6: ; %Flow51 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, v9 +; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX10-NEXT: v_add_nc_u32_e32 v8, -11, v8 +; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 +; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX10-NEXT: v_rndne_f32_e32 v7, v7 +; GFX10-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX10-NEXT: v_bfi_b32 v4, 0x7fffffff, v4, v0 +; GFX10-NEXT: .LBB11_8: +; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB11_10 +; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1 +; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3| +; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB11_11 +; GFX10-NEXT: s_branch .LBB11_16 +; GFX10-NEXT: .LBB11_10: +; GFX10-NEXT: ; implicit-def: $vgpr5 +; GFX10-NEXT: .LBB11_11: ; %frem.compute15 +; GFX10-NEXT: v_frexp_mant_f32_e64 v6, |v3| +; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v1| +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 +; GFX10-NEXT: v_ldexp_f32 v6, v6, 1 +; GFX10-NEXT: v_ldexp_f32 v7, v5, 12 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v3 +; GFX10-NEXT: v_readfirstlane_b32 s2, v8 +; GFX10-NEXT: v_div_scale_f32 v10, s4, v6, v6, 1.0 +; GFX10-NEXT: 
v_readfirstlane_b32 s3, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, -1, v5 +; GFX10-NEXT: v_rcp_f32_e32 v11, v10 +; GFX10-NEXT: v_not_b32_e32 v9, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX10-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 -; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX10-NEXT: v_mul_f32_e32 v12, v8, v11 +; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v8 +; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX10-NEXT: v_fma_f32 v8, -v10, v12, v8 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 -; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 -; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX10-NEXT: v_div_fmas_f32 v8, v8, v11, v12 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v9 +; GFX10-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB11_15 +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 12 +; GFX10-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: s_add_i32 s2, s2, -12 +; GFX10-NEXT: s_cmp_gt_i32 s2, 12 +; GFX10-NEXT: v_mul_f32_e32 v7, v10, v8 +; GFX10-NEXT: v_rndne_f32_e32 v7, v7 +; GFX10-NEXT: v_fma_f32 v7, -v7, v6, v10 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX10-NEXT: v_add_f32_e32 v9, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v7, v7, 12 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_13 +; GFX10-NEXT: ; %bb.14: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v9, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, v10 +; 
GFX10-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX10-NEXT: v_add_nc_u32_e32 v9, -11, v9 +; GFX10-NEXT: v_ldexp_f32 v7, v7, v9 +; GFX10-NEXT: v_mul_f32_e32 v8, v7, v8 +; GFX10-NEXT: v_rndne_f32_e32 v8, v8 +; GFX10-NEXT: v_fma_f32 v7, -v8, v6, v7 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX10-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX10-NEXT: .LBB11_16: ; %Flow50 +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v0| +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v1| +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v2f32: @@ -4322,172 +12464,650 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v7, v6 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; 
GFX11-NEXT: s_cbranch_vccz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0 +; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB11_3 +; GFX11-NEXT: s_branch .LBB11_8 +; GFX11-NEXT: .LBB11_2: +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: .LBB11_3: ; %frem.compute +; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v2| +; GFX11-NEXT: v_frexp_mant_f32_e64 v4, |v0| +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v5, v5, 1 +; GFX11-NEXT: v_ldexp_f32 v6, v4, 12 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, v7 +; GFX11-NEXT: v_div_scale_f32 v9, null, v5, v5, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; GFX11-NEXT: v_rcp_f32_e32 v10, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_7 +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 12 +; GFX11-NEXT: .LBB11_5: ; %frem.loop_body +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: s_add_i32 s2, s2, -12 +; GFX11-NEXT: s_cmp_gt_i32 s2, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX11-NEXT: v_fma_f32 v9, -v6, v8, v5 +; GFX11-NEXT: v_mul_f32_e32 v6, v9, v7 +; GFX11-NEXT: v_rndne_f32_e32 v6, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v7 -; GFX11-NEXT: v_fma_f32 v5, -v6, v8, v5 -; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_fma_f32 v6, -v6, v5, v9 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX11-NEXT: v_add_f32_e32 v8, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v6, v6, 12 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_5 +; GFX11-NEXT: ; %bb.6: ; %Flow51 +; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: v_mov_b32_e32 v6, v9 +; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v8, -11, v8 +; GFX11-NEXT: v_ldexp_f32 v6, v6, v8 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX11-NEXT: v_rndne_f32_e32 v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v6, -v7, v5, v6 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v4, v0 +; GFX11-NEXT: .LBB11_8: +; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB11_10 +; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1 +; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB11_11 +; GFX11-NEXT: s_branch .LBB11_16 +; GFX11-NEXT: .LBB11_10: +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: .LBB11_11: ; %frem.compute15 +; GFX11-NEXT: v_frexp_mant_f32_e64 v6, |v3| +; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v1| +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v6, v6, 1 +; GFX11-NEXT: v_ldexp_f32 v7, v5, 12 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, 
v8 +; GFX11-NEXT: v_div_scale_f32 v10, null, v6, v6, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, -1, v5 +; GFX11-NEXT: v_rcp_f32_e32 v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v9, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX11-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX11-NEXT: v_mul_f32_e32 v12, v8, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX11-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v8 +; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v8, -v10, v12, v8 ; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v8, v8, v11, v12 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_15 +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 12 +; GFX11-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: 
s_add_i32 s2, s2, -12 +; GFX11-NEXT: s_cmp_gt_i32 s2, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v6, v7 -; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v0 +; GFX11-NEXT: v_mul_f32_e32 v7, v10, v8 +; GFX11-NEXT: v_rndne_f32_e32 v7, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v3, v3 -; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0 -; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: v_fma_f32 v7, -v7, v6, v10 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX11-NEXT: v_add_f32_e32 v9, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v7, v7, 12 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_13 +; GFX11-NEXT: ; %bb.14: ; %Flow +; GFX11-NEXT: v_mov_b32_e32 v9, s2 +; GFX11-NEXT: v_mov_b32_e32 v7, v10 +; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v9, -11, v9 +; GFX11-NEXT: v_ldexp_f32 v7, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v8, v7, v8 +; GFX11-NEXT: v_rndne_f32_e32 v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v7, -v8, v6, v7 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX11-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX11-NEXT: .LBB11_16: ; %Flow50 +; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v0| +; GFX11-NEXT: s_and_b32 
vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v1| +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo +; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7fc00000, v5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v2f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v2, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX1150-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 -; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 -; GFX1150-NEXT: s_denorm_mode 15 -; GFX1150-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v7 -; GFX1150-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v9, -v6, v8, v5 -; GFX1150-NEXT: v_fmac_f32_e32 v8, v9, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v5, -v6, v8, v5 -; GFX1150-NEXT: s_denorm_mode 12 -; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, v1 -; GFX1150-NEXT: v_trunc_f32_e32 v5, v5 
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX1150-NEXT: v_fma_f32 v1, v5, v3, v1 -; GFX1150-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 -; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1150-NEXT: global_load_b64 v[1:2], v2, s[6:7] offset:32 +; GFX1150-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1150-NEXT: s_and_b32 s3, s6, 0x7fffffff +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1150-NEXT: s_and_b32 s8, s4, 0x7fffffff +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_cmp_ngt_f32 s3, s8 +; GFX1150-NEXT: s_cbranch_scc0 .LBB11_2 +; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: s_cmp_eq_f32 s3, s8 +; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6 +; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB11_3 +; GFX1150-NEXT: s_branch .LBB11_8 +; GFX1150-NEXT: .LBB11_2: +; GFX1150-NEXT: ; implicit-def: $vgpr0 +; GFX1150-NEXT: .LBB11_3: ; %frem.compute +; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s4| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s6| +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1150-NEXT: v_ldexp_f32 v2, v0, 12 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v0, s4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1150-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-NEXT: 
v_readfirstlane_b32 s8, v0 +; GFX1150-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; GFX1150-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v4, v0 +; GFX1150-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 ; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v6 -; GFX1150-NEXT: v_mul_f32_e32 v7, v3, v6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v7, v3, v6 ; GFX1150-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_fma_f32 v3, -v5, v7, v3 ; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 +; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB11_7 +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: s_sub_i32 s7, s7, s8 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s7, s7, 12 +; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_mov_b32_e32 v5, v2 +; GFX1150-NEXT: s_add_i32 s7, s7, -12 +; GFX1150-NEXT: s_cmp_gt_i32 s7, 12 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v0 -; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1150-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1150-NEXT: v_rndne_f32_e32 v2, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1150-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1150-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v2, v2, 12 +; GFX1150-NEXT: s_cbranch_scc1 .LBB11_5 +; GFX1150-NEXT: ; %bb.6: ; %Flow51 +; GFX1150-NEXT: v_mov_b32_e32 v4, s7 +; GFX1150-NEXT: v_mov_b32_e32 v2, v5 +; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1150-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1150-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s6 +; GFX1150-NEXT: .LBB11_8: +; GFX1150-NEXT: s_and_b32 s6, s5, 0x7fffffff +; GFX1150-NEXT: s_and_b32 s8, s2, 0x7fffffff +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: 
s_cmp_ngt_f32 s6, s8 +; GFX1150-NEXT: s_cbranch_scc0 .LBB11_10 +; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: s_cmp_eq_f32 s6, s8 +; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5 +; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB11_11 +; GFX1150-NEXT: s_branch .LBB11_16 +; GFX1150-NEXT: .LBB11_10: +; GFX1150-NEXT: ; implicit-def: $vgpr1 +; GFX1150-NEXT: .LBB11_11: ; %frem.compute15 +; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s2| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s5| +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1150-NEXT: v_ldexp_f32 v3, v1, 12 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v1, s2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1150-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1150-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v5, v1 +; GFX1150-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1150-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1150-NEXT: v_fma_f32 v9, -v6, v8, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1150-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1150-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 +; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB11_15 +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: s_sub_i32 s7, s7, s8 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s7, s7, 12 +; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_mov_b32_e32 v6, v3 +; GFX1150-NEXT: s_add_i32 s7, s7, -12 +; GFX1150-NEXT: s_cmp_gt_i32 s7, 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1150-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2 -; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1150-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1150-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v3, v3, 12 +; GFX1150-NEXT: s_cbranch_scc1 .LBB11_13 +; GFX1150-NEXT: ; %bb.14: ; %Flow +; GFX1150-NEXT: v_mov_b32_e32 v5, s7 +; GFX1150-NEXT: v_mov_b32_e32 v3, v6 +; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5 +; 
GFX1150-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1150-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1150-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s5 +; GFX1150-NEXT: .LBB11_16: ; %Flow50 +; GFX1150-NEXT: s_cmp_lg_f32 s4, 0 +; GFX1150-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1150-NEXT: s_cmp_nge_f32 s3, 0x7f800000 +; GFX1150-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_and_b32 vcc_lo, s3, s4 +; GFX1150-NEXT: s_cmp_lg_f32 s2, 0 +; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX1150-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-NEXT: s_cmp_nge_f32 s6, 0x7f800000 +; GFX1150-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1150-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7fc00000, v1 +; GFX1150-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ; ; GFX1200-LABEL: frem_v2f32: ; GFX1200: ; %bb.0: ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v2, 0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; 
GFX1200-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX1200-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 -; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 -; GFX1200-NEXT: s_denorm_mode 15 -; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v7 -; GFX1200-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v9, -v6, v8, v5 -; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v5, -v6, v8, v5 -; GFX1200-NEXT: s_denorm_mode 12 -; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, v1 -; GFX1200-NEXT: v_trunc_f32_e32 v5, v5 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX1200-NEXT: v_fma_f32 v1, v5, v3, v1 -; GFX1200-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 -; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1200-NEXT: global_load_b64 v[1:2], v2, s[6:7] offset:32 +; GFX1200-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1200-NEXT: s_and_b32 s3, s6, 0x7fffffff +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1200-NEXT: s_and_b32 s8, s4, 0x7fffffff +; GFX1200-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_cmp_ngt_f32 s3, s8 +; GFX1200-NEXT: s_cbranch_scc0 .LBB11_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: s_cmp_eq_f32 s3, s8 +; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6 +; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB11_3 +; GFX1200-NEXT: s_branch .LBB11_8 +; GFX1200-NEXT: .LBB11_2: +; GFX1200-NEXT: ; implicit-def: $vgpr0 +; GFX1200-NEXT: .LBB11_3: ; %frem.compute +; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s4| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s6| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1200-NEXT: v_ldexp_f32 v2, v0, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v0, s4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1200-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1200-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; GFX1200-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v4, v0 +; GFX1200-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v6 -; GFX1200-NEXT: v_mul_f32_e32 v7, v3, v6 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: 
v_mul_f32_e32 v7, v3, v6 ; GFX1200-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_fma_f32 v3, -v5, v7, v3 ; GFX1200-NEXT: s_denorm_mode 12 ; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 +; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB11_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 +; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v2 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s7, s7, -12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s7, 12 +; GFX1200-NEXT: v_mul_f32_e32 v2, v5, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v0 -; GFX1200-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1200-NEXT: v_rndne_f32_e32 v2, v2 +; GFX1200-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1200-NEXT: v_ldexp_f32 v2, v2, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB11_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow51 
+; GFX1200-NEXT: v_mov_b32_e32 v4, s7 +; GFX1200-NEXT: v_mov_b32_e32 v2, v5 +; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s6 +; GFX1200-NEXT: .LBB11_8: +; GFX1200-NEXT: s_and_b32 s6, s5, 0x7fffffff +; GFX1200-NEXT: s_and_b32 s8, s2, 0x7fffffff +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_ngt_f32 s6, s8 +; GFX1200-NEXT: s_cbranch_scc0 .LBB11_10 +; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: s_cmp_eq_f32 s6, s8 +; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5 +; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB11_11 +; GFX1200-NEXT: s_branch .LBB11_16 +; GFX1200-NEXT: .LBB11_10: +; GFX1200-NEXT: ; implicit-def: $vgpr1 +; GFX1200-NEXT: .LBB11_11: ; %frem.compute15 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s2| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s5| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s5 +; GFX1200-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1200-NEXT: v_ldexp_f32 v3, v1, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v1, s2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1200-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1200-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v5, v1 +; GFX1200-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1200-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1200-NEXT: v_fma_f32 v9, -v6, v8, v4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1200-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 +; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB11_15 +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 +; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX1200-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v6, v3 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s7, s7, -12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s7, 12 +; GFX1200-NEXT: v_mul_f32_e32 v3, v6, v4 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX1200-NEXT: v_fmac_f32_e32 v0, v3, v2 -; GFX1200-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1200-NEXT: v_ldexp_f32 v3, v3, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB11_13 +; GFX1200-NEXT: ; %bb.14: ; %Flow +; GFX1200-NEXT: v_mov_b32_e32 v5, s7 +; GFX1200-NEXT: v_mov_b32_e32 v3, v6 +; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 +; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1200-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo 
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s5 +; GFX1200-NEXT: .LBB11_16: ; %Flow50 +; GFX1200-NEXT: s_cmp_lg_f32 s4, 0 +; GFX1200-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1200-NEXT: s_cmp_nge_f32 s3, 0x7f800000 +; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, s3, s4 +; GFX1200-NEXT: s_cmp_lg_f32 s2, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX1200-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-NEXT: s_cmp_nge_f32 s6, 0x7f800000 +; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7fc00000, v1 +; GFX1200-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 @@ -4501,163 +13121,671 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dwordx4 
v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 -; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 -; SI-NEXT: v_rcp_f32_e32 v10, v9 +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v8, s2, 0, v0 +; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB12_3 +; SI-NEXT: s_branch .LBB12_8 +; SI-NEXT: .LBB12_2: +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB12_3: ; %frem.compute +; SI-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_cselect_b32 s4, s4, 0 +; SI-NEXT: v_frexp_mant_f32_e64 v8, |v0| +; SI-NEXT: v_cndmask_b32_e64 v8, |v0|, v8, s[2:3] +; SI-NEXT: v_ldexp_f32_e64 v9, v8, 12 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s6 +; SI-NEXT: v_frexp_mant_f32_e64 v8, |v4| +; SI-NEXT: v_cndmask_b32_e64 v8, |v4|, v8, s[2:3] +; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v4 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s2, v10 +; SI-NEXT: s_cselect_b32 s5, s2, 0 +; SI-NEXT: s_add_i32 s2, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v8, v8, 1 +; SI-NEXT: s_not_b32 s3, s2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 +; SI-NEXT: v_div_scale_f32 v11, s[6:7], v8, v8, 1.0 +; SI-NEXT: v_rcp_f32_e32 v12, v11 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; SI-NEXT: v_fma_f32 v10, v11, v10, v10 -; SI-NEXT: v_mul_f32_e32 v11, v8, v10 -; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 -; SI-NEXT: v_fma_f32 v11, v12, v10, v11 
-; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; SI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; SI-NEXT: v_fma_f32 v12, v13, v12, v12 +; SI-NEXT: v_mul_f32_e32 v13, v10, v12 +; SI-NEXT: v_fma_f32 v14, -v11, v13, v10 +; SI-NEXT: v_fma_f32 v13, v14, v12, v13 +; SI-NEXT: v_fma_f32 v10, -v11, v13, v10 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 -; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 -; SI-NEXT: v_trunc_f32_e32 v8, v8 -; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 -; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 -; SI-NEXT: v_rcp_f32_e32 v9, v8 +; SI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 +; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 +; SI-NEXT: s_cmp_lt_i32 s3, 13 +; SI-NEXT: s_cbranch_scc1 .LBB12_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s3, s4, s5 +; SI-NEXT: s_add_i32 s3, s3, 12 +; SI-NEXT: .LBB12_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v11, v9 +; SI-NEXT: v_mul_f32_e32 v9, v11, v10 +; SI-NEXT: v_rndne_f32_e32 v9, v9 +; SI-NEXT: v_fma_f32 v9, -v9, v8, v11 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; SI-NEXT: v_add_f32_e32 v12, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; SI-NEXT: v_ldexp_f32_e64 v9, v9, 12 +; SI-NEXT: s_add_i32 s3, s3, -12 +; SI-NEXT: s_cmp_gt_i32 s3, 12 +; SI-NEXT: s_cbranch_scc1 .LBB12_5 +; SI-NEXT: ; %bb.6: ; %Flow125 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: .LBB12_7: ; %frem.loop_exit +; SI-NEXT: s_add_i32 s3, s3, -11 +; SI-NEXT: v_ldexp_f32_e64 v9, v9, s3 +; SI-NEXT: v_mul_f32_e32 v10, v9, v10 +; SI-NEXT: v_rndne_f32_e32 v10, v10 +; SI-NEXT: v_fma_f32 v9, -v10, v8, v9 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; SI-NEXT: v_add_f32_e32 v8, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SI-NEXT: v_ldexp_f32_e64 v8, v8, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v8, s2, v8, v0 +; SI-NEXT: .LBB12_8: +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], 
|v1|, |v5| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB12_10 +; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v9, s2, 0, v1 +; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB12_11 +; SI-NEXT: s_branch .LBB12_16 +; SI-NEXT: .LBB12_10: +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB12_11: ; %frem.compute15 +; SI-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v1 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_cselect_b32 s4, s4, 0 +; SI-NEXT: v_frexp_mant_f32_e64 v9, |v1| +; SI-NEXT: v_cndmask_b32_e64 v9, |v1|, v9, s[2:3] +; SI-NEXT: v_ldexp_f32_e64 v10, v9, 12 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s6 +; SI-NEXT: v_frexp_mant_f32_e64 v9, |v5| +; SI-NEXT: v_cndmask_b32_e64 v9, |v5|, v9, s[2:3] +; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v5 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s2, v11 +; SI-NEXT: s_cselect_b32 s5, s2, 0 +; SI-NEXT: s_add_i32 s2, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v9, v9, 1 +; SI-NEXT: s_not_b32 s3, s2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: v_div_scale_f32 v11, vcc, 1.0, v9, 1.0 +; SI-NEXT: v_div_scale_f32 v12, s[6:7], v9, v9, 1.0 +; SI-NEXT: v_rcp_f32_e32 v13, v12 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; SI-NEXT: v_fma_f32 v9, v10, v9, v9 -; SI-NEXT: v_mul_f32_e32 v10, v7, v9 -; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 -; SI-NEXT: v_fma_f32 v10, v11, v9, v10 -; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 +; SI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; SI-NEXT: v_fma_f32 v13, v14, v13, v13 +; SI-NEXT: v_mul_f32_e32 v14, v11, v13 +; SI-NEXT: v_fma_f32 v15, -v12, v14, v11 +; SI-NEXT: v_fma_f32 v14, v15, v13, v14 +; SI-NEXT: v_fma_f32 v11, -v12, v14, 
v11 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 -; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 -; SI-NEXT: v_trunc_f32_e32 v7, v7 -; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 -; SI-NEXT: v_rcp_f32_e32 v8, v7 +; SI-NEXT: v_div_fmas_f32 v11, v11, v13, v14 +; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 +; SI-NEXT: s_cmp_lt_i32 s3, 13 +; SI-NEXT: s_cbranch_scc1 .LBB12_15 +; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: s_sub_i32 s3, s4, s5 +; SI-NEXT: s_add_i32 s3, s3, 12 +; SI-NEXT: .LBB12_13: ; %frem.loop_body23 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v12, v10 +; SI-NEXT: v_mul_f32_e32 v10, v12, v11 +; SI-NEXT: v_rndne_f32_e32 v10, v10 +; SI-NEXT: v_fma_f32 v10, -v10, v9, v12 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; SI-NEXT: v_add_f32_e32 v13, v10, v9 +; SI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; SI-NEXT: v_ldexp_f32_e64 v10, v10, 12 +; SI-NEXT: s_add_i32 s3, s3, -12 +; SI-NEXT: s_cmp_gt_i32 s3, 12 +; SI-NEXT: s_cbranch_scc1 .LBB12_13 +; SI-NEXT: ; %bb.14: ; %Flow121 +; SI-NEXT: v_mov_b32_e32 v10, v12 +; SI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; SI-NEXT: s_add_i32 s3, s3, -11 +; SI-NEXT: v_ldexp_f32_e64 v10, v10, s3 +; SI-NEXT: v_mul_f32_e32 v11, v10, v11 +; SI-NEXT: v_rndne_f32_e32 v11, v11 +; SI-NEXT: v_fma_f32 v10, -v11, v9, v10 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; SI-NEXT: v_add_f32_e32 v9, v10, v9 +; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v9, v9, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v9, s2, v9, v1 +; SI-NEXT: .LBB12_16: +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB12_18 +; SI-NEXT: ; %bb.17: ; %frem.else47 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v10, s2, 0, v2 +; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| +; SI-NEXT: 
v_cndmask_b32_e32 v10, v2, v10, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB12_19 +; SI-NEXT: s_branch .LBB12_24 +; SI-NEXT: .LBB12_18: +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB12_19: ; %frem.compute46 +; SI-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v2 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_cselect_b32 s4, s4, 0 +; SI-NEXT: v_frexp_mant_f32_e64 v10, |v2| +; SI-NEXT: v_cndmask_b32_e64 v10, |v2|, v10, s[2:3] +; SI-NEXT: v_ldexp_f32_e64 v11, v10, 12 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s6 +; SI-NEXT: v_frexp_mant_f32_e64 v10, |v6| +; SI-NEXT: v_cndmask_b32_e64 v10, |v6|, v10, s[2:3] +; SI-NEXT: v_frexp_exp_i32_f32_e32 v12, v6 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s2, v12 +; SI-NEXT: s_cselect_b32 s5, s2, 0 +; SI-NEXT: s_add_i32 s2, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v10, v10, 1 +; SI-NEXT: s_not_b32 s3, s2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v10, 1.0 +; SI-NEXT: v_div_scale_f32 v13, s[6:7], v10, v10, 1.0 +; SI-NEXT: v_rcp_f32_e32 v14, v13 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; SI-NEXT: v_fma_f32 v8, v9, v8, v8 -; SI-NEXT: v_mul_f32_e32 v9, v6, v8 -; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 -; SI-NEXT: v_fma_f32 v9, v10, v8, v9 -; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; SI-NEXT: v_fma_f32 v15, -v13, v14, 1.0 +; SI-NEXT: v_fma_f32 v14, v15, v14, v14 +; SI-NEXT: v_mul_f32_e32 v15, v12, v14 +; SI-NEXT: v_fma_f32 v16, -v13, v15, v12 +; SI-NEXT: v_fma_f32 v15, v16, v14, v15 +; SI-NEXT: v_fma_f32 v12, -v13, v15, v12 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 -; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 -; SI-NEXT: v_trunc_f32_e32 v6, v6 -; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; SI-NEXT: 
v_div_scale_f32 v5, vcc, v0, v4, v0 -; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 -; SI-NEXT: v_rcp_f32_e32 v7, v6 +; SI-NEXT: v_div_fmas_f32 v12, v12, v14, v15 +; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 +; SI-NEXT: s_cmp_lt_i32 s3, 13 +; SI-NEXT: s_cbranch_scc1 .LBB12_23 +; SI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; SI-NEXT: s_sub_i32 s3, s4, s5 +; SI-NEXT: s_add_i32 s3, s3, 12 +; SI-NEXT: .LBB12_21: ; %frem.loop_body54 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v13, v11 +; SI-NEXT: v_mul_f32_e32 v11, v13, v12 +; SI-NEXT: v_rndne_f32_e32 v11, v11 +; SI-NEXT: v_fma_f32 v11, -v11, v10, v13 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; SI-NEXT: v_add_f32_e32 v14, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; SI-NEXT: v_ldexp_f32_e64 v11, v11, 12 +; SI-NEXT: s_add_i32 s3, s3, -12 +; SI-NEXT: s_cmp_gt_i32 s3, 12 +; SI-NEXT: s_cbranch_scc1 .LBB12_21 +; SI-NEXT: ; %bb.22: ; %Flow117 +; SI-NEXT: v_mov_b32_e32 v11, v13 +; SI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; SI-NEXT: s_add_i32 s3, s3, -11 +; SI-NEXT: v_ldexp_f32_e64 v11, v11, s3 +; SI-NEXT: v_mul_f32_e32 v12, v11, v12 +; SI-NEXT: v_rndne_f32_e32 v12, v12 +; SI-NEXT: v_fma_f32 v11, -v12, v10, v11 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; SI-NEXT: v_add_f32_e32 v10, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; SI-NEXT: v_ldexp_f32_e64 v10, v10, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v10, s2, v10, v2 +; SI-NEXT: .LBB12_24: +; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB12_26 +; SI-NEXT: ; %bb.25: ; %frem.else78 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v11, s2, 0, v3 +; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| +; SI-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB12_27 +; SI-NEXT: s_branch .LBB12_32 +; SI-NEXT: .LBB12_26: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_mov_b64 vcc, 0 
+; SI-NEXT: .LBB12_27: ; %frem.compute77 +; SI-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v3 +; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_cselect_b32 s4, s4, 0 +; SI-NEXT: v_frexp_mant_f32_e64 v11, |v3| +; SI-NEXT: v_cndmask_b32_e64 v11, |v3|, v11, s[2:3] +; SI-NEXT: v_ldexp_f32_e64 v12, v11, 12 +; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, s6 +; SI-NEXT: v_frexp_mant_f32_e64 v11, |v7| +; SI-NEXT: v_cndmask_b32_e64 v11, |v7|, v11, s[2:3] +; SI-NEXT: v_frexp_exp_i32_f32_e32 v13, v7 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_readfirstlane_b32 s2, v13 +; SI-NEXT: s_cselect_b32 s5, s2, 0 +; SI-NEXT: s_add_i32 s2, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v11, v11, 1 +; SI-NEXT: s_not_b32 s3, s2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v11, 1.0 +; SI-NEXT: v_div_scale_f32 v14, s[6:7], v11, v11, 1.0 +; SI-NEXT: v_rcp_f32_e32 v15, v14 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; SI-NEXT: v_fma_f32 v7, v8, v7, v7 -; SI-NEXT: v_mul_f32_e32 v8, v5, v7 -; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 -; SI-NEXT: v_fma_f32 v8, v9, v7, v8 -; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 +; SI-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; SI-NEXT: v_fma_f32 v15, v16, v15, v15 +; SI-NEXT: v_mul_f32_e32 v16, v13, v15 +; SI-NEXT: v_fma_f32 v17, -v14, v16, v13 +; SI-NEXT: v_fma_f32 v16, v17, v15, v16 +; SI-NEXT: v_fma_f32 v13, -v14, v16, v13 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 -; SI-NEXT: v_trunc_f32_e32 v5, v5 -; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 +; SI-NEXT: v_div_fmas_f32 v13, v13, v15, v16 +; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 +; SI-NEXT: s_cmp_lt_i32 s3, 13 +; SI-NEXT: s_cbranch_scc1 .LBB12_31 +; SI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; SI-NEXT: s_sub_i32 s3, s4, 
s5 +; SI-NEXT: s_add_i32 s3, s3, 12 +; SI-NEXT: .LBB12_29: ; %frem.loop_body85 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v14, v12 +; SI-NEXT: v_mul_f32_e32 v12, v14, v13 +; SI-NEXT: v_rndne_f32_e32 v12, v12 +; SI-NEXT: v_fma_f32 v12, -v12, v11, v14 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; SI-NEXT: v_add_f32_e32 v15, v12, v11 +; SI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; SI-NEXT: v_ldexp_f32_e64 v12, v12, 12 +; SI-NEXT: s_add_i32 s3, s3, -12 +; SI-NEXT: s_cmp_gt_i32 s3, 12 +; SI-NEXT: s_cbranch_scc1 .LBB12_29 +; SI-NEXT: ; %bb.30: ; %Flow +; SI-NEXT: v_mov_b32_e32 v12, v14 +; SI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; SI-NEXT: s_add_i32 s3, s3, -11 +; SI-NEXT: v_ldexp_f32_e64 v12, v12, s3 +; SI-NEXT: v_mul_f32_e32 v13, v12, v13 +; SI-NEXT: v_rndne_f32_e32 v13, v13 +; SI-NEXT: v_fma_f32 v12, -v13, v11, v12 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; SI-NEXT: v_add_f32_e32 v11, v12, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; SI-NEXT: v_ldexp_f32_e64 v11, v11, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_bfi_b32 v11, s2, v11, v3 +; SI-NEXT: .LBB12_32: ; %Flow116 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; SI-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; SI-NEXT: s_and_b64 vcc, s[2:3], vcc +; SI-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v1|, s4 +; SI-NEXT: s_and_b64 vcc, s[2:3], vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v2|, s4 +; SI-NEXT: s_and_b64 vcc, s[2:3], vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v10, vcc +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 +; SI-NEXT: v_cmp_nge_f32_e64 s[4:5], |v3|, s4 +; SI-NEXT: s_and_b64 vcc, s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v11, vcc ; SI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s6, s2 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s6, s10 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s8 -; CI-NEXT: s_mov_b32 s1, s9 -; CI-NEXT: s_mov_b32 s8, s10 -; CI-NEXT: s_mov_b32 s9, s11 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 -; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 -; CI-NEXT: v_rcp_f32_e32 v10, v9 +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB12_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v8, s2, 0, v0 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| +; CI-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; CI-NEXT: s_cbranch_execz .LBB12_3 +; CI-NEXT: s_branch .LBB12_8 +; CI-NEXT: .LBB12_2: +; CI-NEXT: ; implicit-def: $vgpr8 +; CI-NEXT: .LBB12_3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f32_e64 v9, |v4| +; CI-NEXT: v_ldexp_f32_e64 v9, v9, 1 +; CI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v8, |v0| +; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v4 +; CI-NEXT: v_ldexp_f32_e64 v11, v8, 12 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v14 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v0 +; CI-NEXT: v_not_b32_e32 v10, v8 +; CI-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v9, 1.0 +; CI-NEXT: v_rcp_f32_e32 
v16, v15 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; CI-NEXT: v_fma_f32 v10, v11, v10, v10 -; CI-NEXT: v_mul_f32_e32 v11, v8, v10 -; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 -; CI-NEXT: v_fma_f32 v11, v12, v10, v11 -; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; CI-NEXT: v_fma_f32 v17, -v15, v16, 1.0 +; CI-NEXT: v_fma_f32 v16, v17, v16, v16 +; CI-NEXT: v_mul_f32_e32 v17, v12, v16 +; CI-NEXT: v_fma_f32 v18, -v15, v17, v12 +; CI-NEXT: v_fma_f32 v17, v18, v16, v17 +; CI-NEXT: v_fma_f32 v12, -v15, v17, v12 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 -; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 -; CI-NEXT: v_trunc_f32_e32 v8, v8 -; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 -; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 -; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; CI-NEXT: v_rcp_f32_e32 v9, v8 +; CI-NEXT: v_div_fmas_f32 v12, v12, v16, v17 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 +; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14 +; CI-NEXT: v_add_i32_e32 v10, vcc, 12, v10 +; CI-NEXT: .LBB12_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v13, v11 +; CI-NEXT: v_mul_f32_e32 v11, v13, v12 +; CI-NEXT: v_rndne_f32_e32 v11, v11 +; CI-NEXT: v_fma_f32 v11, -v11, v9, v13 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; CI-NEXT: v_add_f32_e32 v14, v11, v9 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; CI-NEXT: v_add_i32_e32 v10, vcc, -12, v10 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v10 +; CI-NEXT: v_ldexp_f32_e64 v11, v11, 12 +; CI-NEXT: s_cbranch_vccnz .LBB12_5 +; CI-NEXT: ; %bb.6: ; %Flow125 +; CI-NEXT: v_mov_b32_e32 v11, v13 +; CI-NEXT: .LBB12_7: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10 +; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 +; CI-NEXT: v_mul_f32_e32 v11, v10, v12 +; CI-NEXT: 
v_rndne_f32_e32 v11, v11 +; CI-NEXT: v_fma_f32 v10, -v11, v9, v10 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; CI-NEXT: v_add_f32_e32 v9, v10, v9 +; CI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; CI-NEXT: v_ldexp_f32_e32 v8, v9, v8 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v8, s2, v8, v0 +; CI-NEXT: .LBB12_8: +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB12_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v9, s2, 0, v1 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| +; CI-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc +; CI-NEXT: s_cbranch_execz .LBB12_11 +; CI-NEXT: s_branch .LBB12_16 +; CI-NEXT: .LBB12_10: +; CI-NEXT: ; implicit-def: $vgpr9 +; CI-NEXT: .LBB12_11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f32_e64 v10, |v5| +; CI-NEXT: v_ldexp_f32_e64 v10, v10, 1 +; CI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v9, |v1| +; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v5 +; CI-NEXT: v_ldexp_f32_e64 v12, v9, 12 +; CI-NEXT: v_add_i32_e32 v9, vcc, -1, v15 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v1 +; CI-NEXT: v_not_b32_e32 v11, v9 +; CI-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v10, 1.0 +; CI-NEXT: v_rcp_f32_e32 v17, v16 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; CI-NEXT: v_fma_f32 v9, v10, v9, v9 -; CI-NEXT: v_mul_f32_e32 v10, v7, v9 -; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 -; CI-NEXT: v_fma_f32 v10, v11, v9, v10 -; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 +; CI-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; CI-NEXT: v_fma_f32 v17, v18, v17, v17 +; CI-NEXT: v_mul_f32_e32 v18, v13, v17 +; CI-NEXT: v_fma_f32 v19, -v16, v18, v13 +; CI-NEXT: v_fma_f32 v18, v19, v17, v18 +; CI-NEXT: v_fma_f32 v13, -v16, v18, v13 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 -; CI-NEXT: v_div_fixup_f32 v7, 
v7, v6, v2 -; CI-NEXT: v_trunc_f32_e32 v7, v7 -; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 -; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; CI-NEXT: v_rcp_f32_e32 v8, v7 +; CI-NEXT: v_div_fmas_f32 v13, v13, v17, v18 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 +; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_15 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 +; CI-NEXT: v_add_i32_e32 v11, vcc, 12, v11 +; CI-NEXT: .LBB12_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v14, v12 +; CI-NEXT: v_mul_f32_e32 v12, v14, v13 +; CI-NEXT: v_rndne_f32_e32 v12, v12 +; CI-NEXT: v_fma_f32 v12, -v12, v10, v14 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; CI-NEXT: v_add_f32_e32 v15, v12, v10 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; CI-NEXT: v_add_i32_e32 v11, vcc, -12, v11 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v11 +; CI-NEXT: v_ldexp_f32_e64 v12, v12, 12 +; CI-NEXT: s_cbranch_vccnz .LBB12_13 +; CI-NEXT: ; %bb.14: ; %Flow121 +; CI-NEXT: v_mov_b32_e32 v12, v14 +; CI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11 +; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 +; CI-NEXT: v_mul_f32_e32 v12, v11, v13 +; CI-NEXT: v_rndne_f32_e32 v12, v12 +; CI-NEXT: v_fma_f32 v11, -v12, v10, v11 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; CI-NEXT: v_add_f32_e32 v10, v11, v10 +; CI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; CI-NEXT: v_ldexp_f32_e32 v9, v10, v9 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v9, s2, v9, v1 +; CI-NEXT: .LBB12_16: +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB12_18 +; CI-NEXT: ; %bb.17: ; %frem.else47 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v10, s2, 0, v2 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| +; CI-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc +; CI-NEXT: s_cbranch_execz 
.LBB12_19 +; CI-NEXT: s_branch .LBB12_24 +; CI-NEXT: .LBB12_18: +; CI-NEXT: ; implicit-def: $vgpr10 +; CI-NEXT: .LBB12_19: ; %frem.compute46 +; CI-NEXT: v_frexp_mant_f32_e64 v11, |v6| +; CI-NEXT: v_ldexp_f32_e64 v11, v11, 1 +; CI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v10, |v2| +; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v6 +; CI-NEXT: v_ldexp_f32_e64 v13, v10, 12 +; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v16 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v2 +; CI-NEXT: v_not_b32_e32 v12, v10 +; CI-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CI-NEXT: v_div_scale_f32 v14, vcc, 1.0, v11, 1.0 +; CI-NEXT: v_rcp_f32_e32 v18, v17 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; CI-NEXT: v_fma_f32 v8, v9, v8, v8 -; CI-NEXT: v_mul_f32_e32 v9, v6, v8 -; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 -; CI-NEXT: v_fma_f32 v9, v10, v8, v9 -; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; CI-NEXT: v_fma_f32 v19, -v17, v18, 1.0 +; CI-NEXT: v_fma_f32 v18, v19, v18, v18 +; CI-NEXT: v_mul_f32_e32 v19, v14, v18 +; CI-NEXT: v_fma_f32 v20, -v17, v19, v14 +; CI-NEXT: v_fma_f32 v19, v20, v18, v19 +; CI-NEXT: v_fma_f32 v14, -v17, v19, v14 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 -; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 -; CI-NEXT: v_trunc_f32_e32 v6, v6 -; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 -; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; CI-NEXT: v_rcp_f32_e32 v7, v6 +; CI-NEXT: v_div_fmas_f32 v14, v14, v18, v19 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 +; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_23 +; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 +; CI-NEXT: v_add_i32_e32 v12, vcc, 12, v12 +; CI-NEXT: .LBB12_21: ; %frem.loop_body54 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v15, v13 +; CI-NEXT: 
v_mul_f32_e32 v13, v15, v14 +; CI-NEXT: v_rndne_f32_e32 v13, v13 +; CI-NEXT: v_fma_f32 v13, -v13, v11, v15 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; CI-NEXT: v_add_f32_e32 v16, v13, v11 +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc +; CI-NEXT: v_add_i32_e32 v12, vcc, -12, v12 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v12 +; CI-NEXT: v_ldexp_f32_e64 v13, v13, 12 +; CI-NEXT: s_cbranch_vccnz .LBB12_21 +; CI-NEXT: ; %bb.22: ; %Flow117 +; CI-NEXT: v_mov_b32_e32 v13, v15 +; CI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12 +; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 +; CI-NEXT: v_mul_f32_e32 v13, v12, v14 +; CI-NEXT: v_rndne_f32_e32 v13, v13 +; CI-NEXT: v_fma_f32 v12, -v13, v11, v12 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; CI-NEXT: v_add_f32_e32 v11, v12, v11 +; CI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v10, s2, v10, v2 +; CI-NEXT: .LBB12_24: +; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB12_26 +; CI-NEXT: ; %bb.25: ; %frem.else78 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v11, s2, 0, v3 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| +; CI-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc +; CI-NEXT: s_cbranch_execz .LBB12_27 +; CI-NEXT: s_branch .LBB12_32 +; CI-NEXT: .LBB12_26: +; CI-NEXT: ; implicit-def: $vgpr11 +; CI-NEXT: .LBB12_27: ; %frem.compute77 +; CI-NEXT: v_frexp_mant_f32_e64 v12, |v7| +; CI-NEXT: v_ldexp_f32_e64 v12, v12, 1 +; CI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 +; CI-NEXT: v_frexp_mant_f32_e64 v11, |v3| +; CI-NEXT: v_frexp_exp_i32_f32_e32 v17, v7 +; CI-NEXT: v_ldexp_f32_e64 v14, v11, 12 +; CI-NEXT: v_add_i32_e32 v11, vcc, -1, v17 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v3 +; CI-NEXT: v_not_b32_e32 v13, v11 +; CI-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CI-NEXT: v_div_scale_f32 v15, vcc, 1.0, v12, 1.0 +; CI-NEXT: v_rcp_f32_e32 v19, v18 ; 
CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; CI-NEXT: v_fma_f32 v7, v8, v7, v7 -; CI-NEXT: v_mul_f32_e32 v8, v5, v7 -; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 -; CI-NEXT: v_fma_f32 v8, v9, v7, v8 -; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 +; CI-NEXT: v_fma_f32 v20, -v18, v19, 1.0 +; CI-NEXT: v_fma_f32 v19, v20, v19, v19 +; CI-NEXT: v_mul_f32_e32 v20, v15, v19 +; CI-NEXT: v_fma_f32 v21, -v18, v20, v15 +; CI-NEXT: v_fma_f32 v20, v21, v19, v20 +; CI-NEXT: v_fma_f32 v15, -v18, v20, v15 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 -; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 -; CI-NEXT: v_trunc_f32_e32 v5, v5 -; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 +; CI-NEXT: v_div_fmas_f32 v15, v15, v19, v20 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 +; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB12_31 +; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 +; CI-NEXT: v_add_i32_e32 v13, vcc, 12, v13 +; CI-NEXT: .LBB12_29: ; %frem.loop_body85 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v16, v14 +; CI-NEXT: v_mul_f32_e32 v14, v16, v15 +; CI-NEXT: v_rndne_f32_e32 v14, v14 +; CI-NEXT: v_fma_f32 v14, -v14, v12, v16 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v14 +; CI-NEXT: v_add_f32_e32 v17, v14, v12 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; CI-NEXT: v_add_i32_e32 v13, vcc, -12, v13 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v13 +; CI-NEXT: v_ldexp_f32_e64 v14, v14, 12 +; CI-NEXT: s_cbranch_vccnz .LBB12_29 +; CI-NEXT: ; %bb.30: ; %Flow +; CI-NEXT: v_mov_b32_e32 v14, v16 +; CI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13 +; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 +; CI-NEXT: v_mul_f32_e32 v14, v13, v15 +; CI-NEXT: v_rndne_f32_e32 v14, v14 +; CI-NEXT: v_fma_f32 v13, -v14, v12, v13 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; CI-NEXT: v_add_f32_e32 v12, v13, v12 +; 
CI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_bfi_b32 v11, s2, v11, v3 +; CI-NEXT: .LBB12_32: ; %Flow116 +; CI-NEXT: s_mov_b32 s4, 0x7f800000 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; CI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; CI-NEXT: s_and_b64 vcc, s[2:3], vcc +; CI-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 +; CI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v1|, s4 +; CI-NEXT: s_and_b64 vcc, s[2:3], vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v6 +; CI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v2|, s4 +; CI-NEXT: s_and_b64 vcc, s[2:3], vcc +; CI-NEXT: v_cndmask_b32_e32 v2, v4, v10, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 +; CI-NEXT: v_cmp_nge_f32_e64 s[4:5], |v3|, s4 +; CI-NEXT: s_and_b64 vcc, s[4:5], vcc +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v11, vcc ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -4666,78 +13794,308 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, s0 -; VI-NEXT: s_add_u32 s0, s4, 64 -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_add_u32 s2, s4, 64 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_addc_u32 s3, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 -; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 -; VI-NEXT: v_rcp_f32_e32 v12, v11 +; VI-NEXT: 
v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v8, s2, 0, v0 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| +; VI-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; VI-NEXT: s_cbranch_execz .LBB12_3 +; VI-NEXT: s_branch .LBB12_8 +; VI-NEXT: .LBB12_2: +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: .LBB12_3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f32_e64 v9, |v4| +; VI-NEXT: v_ldexp_f32 v9, v9, 1 +; VI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v8, |v0| +; VI-NEXT: v_frexp_exp_i32_f32_e32 v14, v4 +; VI-NEXT: v_ldexp_f32 v11, v8, 12 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v14 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v13, v0 +; VI-NEXT: v_not_b32_e32 v10, v8 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v13 +; VI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v9, 1.0 +; VI-NEXT: v_rcp_f32_e32 v16, v15 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 -; VI-NEXT: v_fma_f32 v12, v13, v12, v12 -; VI-NEXT: v_mul_f32_e32 v13, v10, v12 -; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 -; VI-NEXT: v_fma_f32 v13, v14, v12, v13 -; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 +; VI-NEXT: v_fma_f32 v17, -v15, v16, 1.0 +; VI-NEXT: v_fma_f32 v16, v17, v16, v16 +; VI-NEXT: v_mul_f32_e32 v17, v12, v16 +; VI-NEXT: v_fma_f32 v18, -v15, v17, v12 +; VI-NEXT: v_fma_f32 v17, v18, v16, v17 +; VI-NEXT: v_fma_f32 v12, -v15, v17, v12 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 -; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 -; VI-NEXT: v_trunc_f32_e32 v10, v10 -; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 -; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 -; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; VI-NEXT: v_rcp_f32_e32 v11, v10 +; VI-NEXT: v_div_fmas_f32 v12, v12, v16, v17 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 +; VI-NEXT: v_div_fixup_f32 
v12, v12, v9, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14 +; VI-NEXT: v_add_u32_e32 v10, vcc, 12, v10 +; VI-NEXT: .LBB12_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v13, v11 +; VI-NEXT: v_mul_f32_e32 v11, v13, v12 +; VI-NEXT: v_rndne_f32_e32 v11, v11 +; VI-NEXT: v_fma_f32 v11, -v11, v9, v13 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; VI-NEXT: v_add_f32_e32 v14, v11, v9 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, -12, v10 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v10 +; VI-NEXT: v_ldexp_f32 v11, v11, 12 +; VI-NEXT: s_cbranch_vccnz .LBB12_5 +; VI-NEXT: ; %bb.6: ; %Flow125 +; VI-NEXT: v_mov_b32_e32 v11, v13 +; VI-NEXT: .LBB12_7: ; %frem.loop_exit +; VI-NEXT: v_add_u32_e32 v10, vcc, -11, v10 +; VI-NEXT: v_ldexp_f32 v10, v11, v10 +; VI-NEXT: v_mul_f32_e32 v11, v10, v12 +; VI-NEXT: v_rndne_f32_e32 v11, v11 +; VI-NEXT: v_fma_f32 v10, -v11, v9, v10 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; VI-NEXT: v_add_f32_e32 v9, v10, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; VI-NEXT: v_ldexp_f32 v8, v9, v8 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v8, s2, v8, v0 +; VI-NEXT: .LBB12_8: +; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB12_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v9, s2, 0, v1 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| +; VI-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc +; VI-NEXT: s_cbranch_execz .LBB12_11 +; VI-NEXT: s_branch .LBB12_16 +; VI-NEXT: .LBB12_10: +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: .LBB12_11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f32_e64 v10, |v5| +; VI-NEXT: v_ldexp_f32 v10, v10, 1 +; VI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v9, |v1| +; VI-NEXT: v_frexp_exp_i32_f32_e32 v15, v5 +; 
VI-NEXT: v_ldexp_f32 v12, v9, 12 +; VI-NEXT: v_add_u32_e32 v9, vcc, -1, v15 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v14, v1 +; VI-NEXT: v_not_b32_e32 v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v14 +; VI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v10, 1.0 +; VI-NEXT: v_rcp_f32_e32 v17, v16 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; VI-NEXT: v_fma_f32 v11, v12, v11, v11 -; VI-NEXT: v_mul_f32_e32 v12, v7, v11 -; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 -; VI-NEXT: v_fma_f32 v12, v13, v11, v12 -; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 +; VI-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; VI-NEXT: v_fma_f32 v17, v18, v17, v17 +; VI-NEXT: v_mul_f32_e32 v18, v13, v17 +; VI-NEXT: v_fma_f32 v19, -v16, v18, v13 +; VI-NEXT: v_fma_f32 v18, v19, v17, v18 +; VI-NEXT: v_fma_f32 v13, -v16, v18, v13 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 -; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 -; VI-NEXT: v_trunc_f32_e32 v7, v7 -; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 -; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; VI-NEXT: v_rcp_f32_e32 v10, v7 +; VI-NEXT: v_div_fmas_f32 v13, v13, v17, v18 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 +; VI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_15 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_sub_u32_e32 v11, vcc, v14, v15 +; VI-NEXT: v_add_u32_e32 v11, vcc, 12, v11 +; VI-NEXT: .LBB12_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v14, v12 +; VI-NEXT: v_mul_f32_e32 v12, v14, v13 +; VI-NEXT: v_rndne_f32_e32 v12, v12 +; VI-NEXT: v_fma_f32 v12, -v12, v10, v14 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; VI-NEXT: v_add_f32_e32 v15, v12, v10 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, -12, v11 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v11 +; VI-NEXT: v_ldexp_f32 v12, v12, 12 
+; VI-NEXT: s_cbranch_vccnz .LBB12_13 +; VI-NEXT: ; %bb.14: ; %Flow121 +; VI-NEXT: v_mov_b32_e32 v12, v14 +; VI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; VI-NEXT: v_add_u32_e32 v11, vcc, -11, v11 +; VI-NEXT: v_ldexp_f32 v11, v12, v11 +; VI-NEXT: v_mul_f32_e32 v12, v11, v13 +; VI-NEXT: v_rndne_f32_e32 v12, v12 +; VI-NEXT: v_fma_f32 v11, -v12, v10, v11 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; VI-NEXT: v_add_f32_e32 v10, v11, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; VI-NEXT: v_ldexp_f32 v9, v10, v9 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v9, s2, v9, v1 +; VI-NEXT: .LBB12_16: +; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB12_18 +; VI-NEXT: ; %bb.17: ; %frem.else47 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v10, s2, 0, v2 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| +; VI-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc +; VI-NEXT: s_cbranch_execz .LBB12_19 +; VI-NEXT: s_branch .LBB12_24 +; VI-NEXT: .LBB12_18: +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: .LBB12_19: ; %frem.compute46 +; VI-NEXT: v_frexp_mant_f32_e64 v11, |v6| +; VI-NEXT: v_ldexp_f32 v11, v11, 1 +; VI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v10, |v2| +; VI-NEXT: v_frexp_exp_i32_f32_e32 v16, v6 +; VI-NEXT: v_ldexp_f32 v13, v10, 12 +; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v16 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v15, v2 +; VI-NEXT: v_not_b32_e32 v12, v10 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v15 +; VI-NEXT: v_div_scale_f32 v14, vcc, 1.0, v11, 1.0 +; VI-NEXT: v_rcp_f32_e32 v18, v17 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 -; VI-NEXT: v_fma_f32 v10, v11, v10, v10 -; VI-NEXT: v_mul_f32_e32 v11, v6, v10 -; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 -; VI-NEXT: v_fma_f32 v11, v12, v10, v11 -; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 +; VI-NEXT: v_fma_f32 v19, -v17, v18, 1.0 +; VI-NEXT: v_fma_f32 v18, v19, v18, v18 +; 
VI-NEXT: v_mul_f32_e32 v19, v14, v18 +; VI-NEXT: v_fma_f32 v20, -v17, v19, v14 +; VI-NEXT: v_fma_f32 v19, v20, v18, v19 +; VI-NEXT: v_fma_f32 v14, -v17, v19, v14 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 -; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 -; VI-NEXT: v_trunc_f32_e32 v6, v6 -; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 -; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; VI-NEXT: v_rcp_f32_e32 v7, v6 +; VI-NEXT: v_div_fmas_f32 v14, v14, v18, v19 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 +; VI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_23 +; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; VI-NEXT: v_sub_u32_e32 v12, vcc, v15, v16 +; VI-NEXT: v_add_u32_e32 v12, vcc, 12, v12 +; VI-NEXT: .LBB12_21: ; %frem.loop_body54 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v15, v13 +; VI-NEXT: v_mul_f32_e32 v13, v15, v14 +; VI-NEXT: v_rndne_f32_e32 v13, v13 +; VI-NEXT: v_fma_f32 v13, -v13, v11, v15 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; VI-NEXT: v_add_f32_e32 v16, v13, v11 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, -12, v12 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v12 +; VI-NEXT: v_ldexp_f32 v13, v13, 12 +; VI-NEXT: s_cbranch_vccnz .LBB12_21 +; VI-NEXT: ; %bb.22: ; %Flow117 +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; VI-NEXT: v_add_u32_e32 v12, vcc, -11, v12 +; VI-NEXT: v_ldexp_f32 v12, v13, v12 +; VI-NEXT: v_mul_f32_e32 v13, v12, v14 +; VI-NEXT: v_rndne_f32_e32 v13, v13 +; VI-NEXT: v_fma_f32 v12, -v13, v11, v12 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; VI-NEXT: v_add_f32_e32 v11, v12, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; VI-NEXT: v_ldexp_f32 v10, v11, v10 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v10, s2, v10, v2 +; VI-NEXT: .LBB12_24: +; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| +; VI-NEXT: 
s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB12_26 +; VI-NEXT: ; %bb.25: ; %frem.else78 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v11, s2, 0, v3 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc +; VI-NEXT: s_cbranch_execz .LBB12_27 +; VI-NEXT: s_branch .LBB12_32 +; VI-NEXT: .LBB12_26: +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: .LBB12_27: ; %frem.compute77 +; VI-NEXT: v_frexp_mant_f32_e64 v12, |v7| +; VI-NEXT: v_ldexp_f32 v12, v12, 1 +; VI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 +; VI-NEXT: v_frexp_mant_f32_e64 v11, |v3| +; VI-NEXT: v_frexp_exp_i32_f32_e32 v17, v7 +; VI-NEXT: v_ldexp_f32 v14, v11, 12 +; VI-NEXT: v_add_u32_e32 v11, vcc, -1, v17 +; VI-NEXT: v_frexp_exp_i32_f32_e32 v16, v3 +; VI-NEXT: v_not_b32_e32 v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v16 +; VI-NEXT: v_div_scale_f32 v15, vcc, 1.0, v12, 1.0 +; VI-NEXT: v_rcp_f32_e32 v19, v18 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 -; VI-NEXT: v_fma_f32 v7, v10, v7, v7 -; VI-NEXT: v_mul_f32_e32 v10, v5, v7 -; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 -; VI-NEXT: v_fma_f32 v10, v11, v7, v10 -; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 +; VI-NEXT: v_fma_f32 v20, -v18, v19, 1.0 +; VI-NEXT: v_fma_f32 v19, v20, v19, v19 +; VI-NEXT: v_mul_f32_e32 v20, v15, v19 +; VI-NEXT: v_fma_f32 v21, -v18, v20, v15 +; VI-NEXT: v_fma_f32 v20, v21, v19, v20 +; VI-NEXT: v_fma_f32 v15, -v18, v20, v15 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 -; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 -; VI-NEXT: v_trunc_f32_e32 v5, v5 -; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: v_div_fmas_f32 v15, v15, v19, v20 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 +; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB12_31 +; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; VI-NEXT: v_sub_u32_e32 
v13, vcc, v16, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 12, v13 +; VI-NEXT: .LBB12_29: ; %frem.loop_body85 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v16, v14 +; VI-NEXT: v_mul_f32_e32 v14, v16, v15 +; VI-NEXT: v_rndne_f32_e32 v14, v14 +; VI-NEXT: v_fma_f32 v14, -v14, v12, v16 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v14 +; VI-NEXT: v_add_f32_e32 v17, v14, v12 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, -12, v13 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v13 +; VI-NEXT: v_ldexp_f32 v14, v14, 12 +; VI-NEXT: s_cbranch_vccnz .LBB12_29 +; VI-NEXT: ; %bb.30: ; %Flow +; VI-NEXT: v_mov_b32_e32 v14, v16 +; VI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; VI-NEXT: v_add_u32_e32 v13, vcc, -11, v13 +; VI-NEXT: v_ldexp_f32 v13, v14, v13 +; VI-NEXT: v_mul_f32_e32 v14, v13, v15 +; VI-NEXT: v_rndne_f32_e32 v14, v14 +; VI-NEXT: v_fma_f32 v13, -v14, v12, v13 +; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; VI-NEXT: v_add_f32_e32 v12, v13, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_ldexp_f32 v11, v12, v11 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_bfi_b32 v11, s2, v11, v3 +; VI-NEXT: .LBB12_32: ; %Flow116 +; VI-NEXT: s_mov_b32 s4, 0x7f800000 +; VI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v12, 0x7fc00000 +; VI-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; VI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v1|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc +; VI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v6 +; VI-NEXT: v_cmp_nge_f32_e64 s[2:3], |v2|, s4 +; VI-NEXT: s_and_b64 vcc, s[2:3], vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v12, v10, vcc +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 +; VI-NEXT: v_cmp_nge_f32_e64 s[0:1], |v3|, s4 +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: 
v_cndmask_b32_e32 v3, v12, v11, vcc +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: frem_v4f32: @@ -4749,67 +14107,298 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v10, s[2:3], v7, v7, v3 -; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 -; GFX9-NEXT: v_rcp_f32_e32 v11, v10 +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v8, s2, 0, v0 +; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: s_branch .LBB12_8 +; GFX9-NEXT: .LBB12_2: +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: .LBB12_3: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f32_e64 v9, |v4| +; GFX9-NEXT: v_ldexp_f32 v9, v9, 1 +; GFX9-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 +; GFX9-NEXT: v_div_scale_f32 v12, vcc, 1.0, v9, 1.0 +; GFX9-NEXT: v_frexp_mant_f32_e64 v8, |v0| +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v13, v0 +; GFX9-NEXT: v_ldexp_f32 v11, v8, 12 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v14, v4 +; GFX9-NEXT: v_add_u32_e32 v8, -1, v14 +; GFX9-NEXT: v_not_b32_e32 v10, v8 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v13 +; GFX9-NEXT: v_rcp_f32_e32 v16, v15 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 -; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 -; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 -; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 -; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX9-NEXT: v_fma_f32 v17, -v15, v16, 1.0 +; GFX9-NEXT: v_fma_f32 v16, v17, v16, v16 +; GFX9-NEXT: v_mul_f32_e32 v17, v12, v16 +; 
GFX9-NEXT: v_fma_f32 v18, -v15, v17, v12 +; GFX9-NEXT: v_fma_f32 v17, v18, v16, v17 +; GFX9-NEXT: v_fma_f32 v12, -v15, v17, v12 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 -; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 -; GFX9-NEXT: v_trunc_f32_e32 v9, v9 -; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX9-NEXT: v_div_scale_f32 v9, s[2:3], v6, v6, v2 -; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; GFX9-NEXT: v_rcp_f32_e32 v10, v9 +; GFX9-NEXT: v_div_fmas_f32 v12, v12, v16, v17 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 +; GFX9-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v10, v13, v14 +; GFX9-NEXT: v_add_u32_e32 v10, 12, v10 +; GFX9-NEXT: .LBB12_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-NEXT: v_mul_f32_e32 v11, v13, v12 +; GFX9-NEXT: v_rndne_f32_e32 v11, v11 +; GFX9-NEXT: v_fma_f32 v11, -v11, v9, v13 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; GFX9-NEXT: v_add_f32_e32 v14, v11, v9 +; GFX9-NEXT: v_add_u32_e32 v10, -12, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v10 +; GFX9-NEXT: v_ldexp_f32 v11, v11, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_5 +; GFX9-NEXT: ; %bb.6: ; %Flow125 +; GFX9-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX9-NEXT: v_add_u32_e32 v10, -11, v10 +; GFX9-NEXT: v_ldexp_f32 v10, v11, v10 +; GFX9-NEXT: v_mul_f32_e32 v11, v10, v12 +; GFX9-NEXT: v_rndne_f32_e32 v11, v11 +; GFX9-NEXT: v_fma_f32 v10, -v11, v9, v10 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GFX9-NEXT: v_ldexp_f32 v8, v9, v8 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v8, s2, v8, v0 +; GFX9-NEXT: .LBB12_8: +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| +; 
GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB12_10 +; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v1 +; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| +; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_11 +; GFX9-NEXT: s_branch .LBB12_16 +; GFX9-NEXT: .LBB12_10: +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: .LBB12_11: ; %frem.compute15 +; GFX9-NEXT: v_frexp_mant_f32_e64 v10, |v5| +; GFX9-NEXT: v_ldexp_f32 v10, v10, 1 +; GFX9-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 +; GFX9-NEXT: v_div_scale_f32 v13, vcc, 1.0, v10, 1.0 +; GFX9-NEXT: v_frexp_mant_f32_e64 v9, |v1| +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v14, v1 +; GFX9-NEXT: v_ldexp_f32 v12, v9, 12 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v15, v5 +; GFX9-NEXT: v_add_u32_e32 v9, -1, v15 +; GFX9-NEXT: v_not_b32_e32 v11, v9 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v14 +; GFX9-NEXT: v_rcp_f32_e32 v17, v16 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 -; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 -; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 -; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 -; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX9-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; GFX9-NEXT: v_fma_f32 v17, v18, v17, v17 +; GFX9-NEXT: v_mul_f32_e32 v18, v13, v17 +; GFX9-NEXT: v_fma_f32 v19, -v16, v18, v13 +; GFX9-NEXT: v_fma_f32 v18, v19, v17, v18 +; GFX9-NEXT: v_fma_f32 v13, -v16, v18, v13 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 -; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX9-NEXT: v_div_scale_f32 v7, s[2:3], v5, v5, v1 -; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; GFX9-NEXT: v_rcp_f32_e32 v9, v7 +; GFX9-NEXT: v_div_fmas_f32 v13, v13, v17, v18 +; GFX9-NEXT: v_cmp_gt_i32_e32 
vcc, 13, v11 +; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_15 +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15 +; GFX9-NEXT: v_add_u32_e32 v11, 12, v11 +; GFX9-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13 +; GFX9-NEXT: v_rndne_f32_e32 v12, v12 +; GFX9-NEXT: v_fma_f32 v12, -v12, v10, v14 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; GFX9-NEXT: v_add_f32_e32 v15, v12, v10 +; GFX9-NEXT: v_add_u32_e32 v11, -12, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v11 +; GFX9-NEXT: v_ldexp_f32 v12, v12, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_13 +; GFX9-NEXT: ; %bb.14: ; %Flow121 +; GFX9-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX9-NEXT: v_add_u32_e32 v11, -11, v11 +; GFX9-NEXT: v_ldexp_f32 v11, v12, v11 +; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13 +; GFX9-NEXT: v_rndne_f32_e32 v12, v12 +; GFX9-NEXT: v_fma_f32 v11, -v12, v10, v11 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GFX9-NEXT: v_ldexp_f32 v9, v10, v9 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v9, s2, v9, v1 +; GFX9-NEXT: .LBB12_16: +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB12_18 +; GFX9-NEXT: ; %bb.17: ; %frem.else47 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v10, s2, 0, v2 +; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| +; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_19 +; GFX9-NEXT: s_branch .LBB12_24 +; GFX9-NEXT: .LBB12_18: +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: .LBB12_19: ; %frem.compute46 +; GFX9-NEXT: v_frexp_mant_f32_e64 v11, |v6| +; GFX9-NEXT: v_ldexp_f32 v11, v11, 1 +; 
GFX9-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 +; GFX9-NEXT: v_div_scale_f32 v14, vcc, 1.0, v11, 1.0 +; GFX9-NEXT: v_frexp_mant_f32_e64 v10, |v2| +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v15, v2 +; GFX9-NEXT: v_ldexp_f32 v13, v10, 12 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v16, v6 +; GFX9-NEXT: v_add_u32_e32 v10, -1, v16 +; GFX9-NEXT: v_not_b32_e32 v12, v10 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v15 +; GFX9-NEXT: v_rcp_f32_e32 v18, v17 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 -; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 -; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 -; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 -; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 -; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX9-NEXT: v_fma_f32 v19, -v17, v18, 1.0 +; GFX9-NEXT: v_fma_f32 v18, v19, v18, v18 +; GFX9-NEXT: v_mul_f32_e32 v19, v14, v18 +; GFX9-NEXT: v_fma_f32 v20, -v17, v19, v14 +; GFX9-NEXT: v_fma_f32 v19, v20, v18, v19 +; GFX9-NEXT: v_fma_f32 v14, -v17, v19, v14 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 -; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX9-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, v0 -; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; GFX9-NEXT: v_rcp_f32_e32 v7, v6 +; GFX9-NEXT: v_div_fmas_f32 v14, v14, v18, v19 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 +; GFX9-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_23 +; GFX9-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX9-NEXT: v_sub_u32_e32 v12, v15, v16 +; GFX9-NEXT: v_add_u32_e32 v12, 12, v12 +; GFX9-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v15, v13 +; GFX9-NEXT: v_mul_f32_e32 v13, v15, v14 +; GFX9-NEXT: v_rndne_f32_e32 v13, v13 +; GFX9-NEXT: v_fma_f32 v13, -v13, v11, v15 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; 
GFX9-NEXT: v_add_f32_e32 v16, v13, v11 +; GFX9-NEXT: v_add_u32_e32 v12, -12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v12 +; GFX9-NEXT: v_ldexp_f32 v13, v13, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_21 +; GFX9-NEXT: ; %bb.22: ; %Flow117 +; GFX9-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX9-NEXT: v_add_u32_e32 v12, -11, v12 +; GFX9-NEXT: v_ldexp_f32 v12, v13, v12 +; GFX9-NEXT: v_mul_f32_e32 v13, v12, v14 +; GFX9-NEXT: v_rndne_f32_e32 v13, v13 +; GFX9-NEXT: v_fma_f32 v12, -v13, v11, v12 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, v12, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; GFX9-NEXT: v_ldexp_f32 v10, v11, v10 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v10, s2, v10, v2 +; GFX9-NEXT: .LBB12_24: +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB12_26 +; GFX9-NEXT: ; %bb.25: ; %frem.else78 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v11, s2, 0, v3 +; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| +; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_27 +; GFX9-NEXT: s_branch .LBB12_32 +; GFX9-NEXT: .LBB12_26: +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: .LBB12_27: ; %frem.compute77 +; GFX9-NEXT: v_frexp_mant_f32_e64 v12, |v7| +; GFX9-NEXT: v_ldexp_f32 v12, v12, 1 +; GFX9-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 +; GFX9-NEXT: v_div_scale_f32 v15, vcc, 1.0, v12, 1.0 +; GFX9-NEXT: v_frexp_mant_f32_e64 v11, |v3| +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v16, v3 +; GFX9-NEXT: v_ldexp_f32 v14, v11, 12 +; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v17, v7 +; GFX9-NEXT: v_add_u32_e32 v11, -1, v17 +; GFX9-NEXT: v_not_b32_e32 v13, v11 +; GFX9-NEXT: v_add_u32_e32 v13, v13, v16 +; GFX9-NEXT: v_rcp_f32_e32 v19, v18 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 
-; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 -; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 -; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX9-NEXT: v_fma_f32 v20, -v18, v19, 1.0 +; GFX9-NEXT: v_fma_f32 v19, v20, v19, v19 +; GFX9-NEXT: v_mul_f32_e32 v20, v15, v19 +; GFX9-NEXT: v_fma_f32 v21, -v18, v20, v15 +; GFX9-NEXT: v_fma_f32 v20, v21, v19, v20 +; GFX9-NEXT: v_fma_f32 v15, -v18, v20, v15 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 -; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: v_div_fmas_f32 v15, v15, v19, v20 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 +; GFX9-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_31 +; GFX9-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX9-NEXT: v_sub_u32_e32 v13, v16, v17 +; GFX9-NEXT: v_add_u32_e32 v13, 12, v13 +; GFX9-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v16, v14 +; GFX9-NEXT: v_mul_f32_e32 v14, v16, v15 +; GFX9-NEXT: v_rndne_f32_e32 v14, v14 +; GFX9-NEXT: v_fma_f32 v14, -v14, v12, v16 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v14 +; GFX9-NEXT: v_add_f32_e32 v17, v14, v12 +; GFX9-NEXT: v_add_u32_e32 v13, -12, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 12, v13 +; GFX9-NEXT: v_ldexp_f32 v14, v14, 12 +; GFX9-NEXT: s_cbranch_vccnz .LBB12_29 +; GFX9-NEXT: ; %bb.30: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX9-NEXT: v_add_u32_e32 v13, -11, v13 +; GFX9-NEXT: v_ldexp_f32 v13, v14, v13 +; GFX9-NEXT: v_mul_f32_e32 v14, v13, v15 +; GFX9-NEXT: v_rndne_f32_e32 v14, v14 +; GFX9-NEXT: v_fma_f32 v13, -v14, v12, v13 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 +; GFX9-NEXT: 
v_add_f32_e32 v12, v13, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GFX9-NEXT: v_ldexp_f32 v11, v12, v11 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_bfi_b32 v11, s2, v11, v3 +; GFX9-NEXT: .LBB12_32: ; %Flow116 +; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], |v0|, s4 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], |v1|, s4 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], |v2|, s4 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v10, vcc +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], |v3|, s4 +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v11, vcc +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v4f32: @@ -4823,67 +14412,300 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v10, s2, v7, v7, v3 -; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 -; GFX10-NEXT: v_rcp_f32_e32 v11, v10 +; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB12_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0 +; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4| +; GFX10-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB12_3 +; GFX10-NEXT: s_branch .LBB12_8 +; GFX10-NEXT: .LBB12_2: +; 
GFX10-NEXT: ; implicit-def: $vgpr8 +; GFX10-NEXT: .LBB12_3: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v4| +; GFX10-NEXT: v_frexp_mant_f32_e64 v8, |v0| +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v11, v0 +; GFX10-NEXT: v_ldexp_f32 v9, v9, 1 +; GFX10-NEXT: v_ldexp_f32 v10, v8, 12 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v8, v4 +; GFX10-NEXT: v_readfirstlane_b32 s2, v11 +; GFX10-NEXT: v_div_scale_f32 v13, s4, v9, v9, 1.0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v8, -1, v8 +; GFX10-NEXT: v_rcp_f32_e32 v14, v13 +; GFX10-NEXT: v_not_b32_e32 v12, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v12, v12, v11 +; GFX10-NEXT: v_div_scale_f32 v11, vcc_lo, 1.0, v9, 1.0 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 -; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 -; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 -; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 -; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX10-NEXT: v_fma_f32 v15, -v13, v14, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v14, v15, v14 +; GFX10-NEXT: v_mul_f32_e32 v15, v11, v14 +; GFX10-NEXT: v_fma_f32 v16, -v13, v15, v11 +; GFX10-NEXT: v_fmac_f32_e32 v15, v16, v14 +; GFX10-NEXT: v_fma_f32 v11, -v13, v15, v11 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 -; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 -; GFX10-NEXT: v_trunc_f32_e32 v9, v9 -; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX10-NEXT: v_div_scale_f32 v9, s2, v6, v6, v2 -; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 -; GFX10-NEXT: v_rcp_f32_e32 v10, v9 +; GFX10-NEXT: v_div_fmas_f32 v11, v11, v14, v15 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v12 +; GFX10-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB12_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 12 +; GFX10-NEXT: .LBB12_5: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX10-NEXT: v_mov_b32_e32 v13, v10 +; GFX10-NEXT: s_add_i32 s2, s2, -12 +; GFX10-NEXT: s_cmp_gt_i32 s2, 12 +; GFX10-NEXT: v_mul_f32_e32 v10, v13, v11 +; GFX10-NEXT: v_rndne_f32_e32 v10, v10 +; GFX10-NEXT: v_fma_f32 v10, -v10, v9, v13 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX10-NEXT: v_add_f32_e32 v12, v10, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v10, v10, 12 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_5 +; GFX10-NEXT: ; %bb.6: ; %Flow125 +; GFX10-NEXT: v_mov_b32_e32 v12, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, v13 +; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX10-NEXT: v_add_nc_u32_e32 v12, -11, v12 +; GFX10-NEXT: v_ldexp_f32 v10, v10, v12 +; GFX10-NEXT: v_mul_f32_e32 v11, v10, v11 +; GFX10-NEXT: v_rndne_f32_e32 v11, v11 +; GFX10-NEXT: v_fma_f32 v10, -v11, v9, v10 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX10-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v8, v9, v8 +; GFX10-NEXT: v_bfi_b32 v8, 0x7fffffff, v8, v0 +; GFX10-NEXT: .LBB12_8: +; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB12_10 +; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1 +; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5| +; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB12_11 +; GFX10-NEXT: s_branch .LBB12_16 +; GFX10-NEXT: .LBB12_10: +; GFX10-NEXT: ; implicit-def: $vgpr9 +; GFX10-NEXT: .LBB12_11: ; %frem.compute15 +; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v5| +; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v1| +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v1 +; GFX10-NEXT: v_ldexp_f32 v10, v10, 1 +; GFX10-NEXT: v_ldexp_f32 v11, v9, 12 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v9, v5 +; GFX10-NEXT: v_readfirstlane_b32 s2, v12 +; GFX10-NEXT: v_div_scale_f32 v14, s4, v10, v10, 1.0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v9 +; GFX10-NEXT: 
v_add_nc_u32_e32 v9, -1, v9 +; GFX10-NEXT: v_rcp_f32_e32 v15, v14 +; GFX10-NEXT: v_not_b32_e32 v13, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v13, v13, v12 +; GFX10-NEXT: v_div_scale_f32 v12, vcc_lo, 1.0, v10, 1.0 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 -; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 -; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 -; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 -; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX10-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v15, v16, v15 +; GFX10-NEXT: v_mul_f32_e32 v16, v12, v15 +; GFX10-NEXT: v_fma_f32 v17, -v14, v16, v12 +; GFX10-NEXT: v_fmac_f32_e32 v16, v17, v15 +; GFX10-NEXT: v_fma_f32 v12, -v14, v16, v12 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 -; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 -; GFX10-NEXT: v_trunc_f32_e32 v7, v7 -; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX10-NEXT: v_div_scale_f32 v7, s2, v5, v5, v1 -; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 -; GFX10-NEXT: v_rcp_f32_e32 v9, v7 +; GFX10-NEXT: v_div_fmas_f32 v12, v12, v15, v16 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v13 +; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB12_15 +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 12 +; GFX10-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v14, v11 +; GFX10-NEXT: s_add_i32 s2, s2, -12 +; GFX10-NEXT: s_cmp_gt_i32 s2, 12 +; GFX10-NEXT: v_mul_f32_e32 v11, v14, v12 +; GFX10-NEXT: v_rndne_f32_e32 v11, v11 +; GFX10-NEXT: v_fma_f32 v11, -v11, v10, v14 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: v_add_f32_e32 v13, v11, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v11, v11, 12 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_13 +; GFX10-NEXT: ; 
%bb.14: ; %Flow121 +; GFX10-NEXT: v_mov_b32_e32 v13, s2 +; GFX10-NEXT: v_mov_b32_e32 v11, v14 +; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX10-NEXT: v_add_nc_u32_e32 v13, -11, v13 +; GFX10-NEXT: v_ldexp_f32 v11, v11, v13 +; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12 +; GFX10-NEXT: v_rndne_f32_e32 v12, v12 +; GFX10-NEXT: v_fma_f32 v11, -v12, v10, v11 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v9, v10, v9 +; GFX10-NEXT: v_bfi_b32 v9, 0x7fffffff, v9, v1 +; GFX10-NEXT: .LBB12_16: +; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB12_18 +; GFX10-NEXT: ; %bb.17: ; %frem.else47 +; GFX10-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2 +; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6| +; GFX10-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB12_19 +; GFX10-NEXT: s_branch .LBB12_24 +; GFX10-NEXT: .LBB12_18: +; GFX10-NEXT: ; implicit-def: $vgpr10 +; GFX10-NEXT: .LBB12_19: ; %frem.compute46 +; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v6| +; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v2| +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v13, v2 +; GFX10-NEXT: v_ldexp_f32 v11, v11, 1 +; GFX10-NEXT: v_ldexp_f32 v12, v10, 12 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v10, v6 +; GFX10-NEXT: v_readfirstlane_b32 s2, v13 +; GFX10-NEXT: v_div_scale_f32 v15, s4, v11, v11, 1.0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 +; GFX10-NEXT: v_rcp_f32_e32 v16, v15 +; GFX10-NEXT: v_not_b32_e32 v14, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v14, v14, v13 +; GFX10-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v11, 1.0 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9 -; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 -; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 -; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9 -; 
GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX10-NEXT: v_fma_f32 v17, -v15, v16, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v16, v17, v16 +; GFX10-NEXT: v_mul_f32_e32 v17, v13, v16 +; GFX10-NEXT: v_fma_f32 v18, -v15, v17, v13 +; GFX10-NEXT: v_fmac_f32_e32 v17, v18, v16 +; GFX10-NEXT: v_fma_f32 v13, -v15, v17, v13 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 -; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 -; GFX10-NEXT: v_trunc_f32_e32 v6, v6 -; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX10-NEXT: v_div_scale_f32 v6, s2, v4, v4, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 -; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: v_div_fmas_f32 v13, v13, v16, v17 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v14 +; GFX10-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB12_23 +; GFX10-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 12 +; GFX10-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v15, v12 +; GFX10-NEXT: s_add_i32 s2, s2, -12 +; GFX10-NEXT: s_cmp_gt_i32 s2, 12 +; GFX10-NEXT: v_mul_f32_e32 v12, v15, v13 +; GFX10-NEXT: v_rndne_f32_e32 v12, v12 +; GFX10-NEXT: v_fma_f32 v12, -v12, v11, v15 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_add_f32_e32 v14, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v12, v12, 12 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_21 +; GFX10-NEXT: ; %bb.22: ; %Flow117 +; GFX10-NEXT: v_mov_b32_e32 v14, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, v15 +; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX10-NEXT: v_add_nc_u32_e32 v14, -11, v14 +; GFX10-NEXT: v_ldexp_f32 v12, v12, v14 +; GFX10-NEXT: v_mul_f32_e32 v13, v12, v13 +; GFX10-NEXT: v_rndne_f32_e32 v13, v13 +; GFX10-NEXT: v_fma_f32 v12, -v13, v11, v12 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_add_f32_e32 v11, v12, v11 +; GFX10-NEXT: 
v_cndmask_b32_e32 v11, v12, v11, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v10, v11, v10 +; GFX10-NEXT: v_bfi_b32 v10, 0x7fffffff, v10, v2 +; GFX10-NEXT: .LBB12_24: +; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB12_26 +; GFX10-NEXT: ; %bb.25: ; %frem.else78 +; GFX10-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3 +; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7| +; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB12_27 +; GFX10-NEXT: s_branch .LBB12_32 +; GFX10-NEXT: .LBB12_26: +; GFX10-NEXT: ; implicit-def: $vgpr11 +; GFX10-NEXT: .LBB12_27: ; %frem.compute77 +; GFX10-NEXT: v_frexp_mant_f32_e64 v12, |v7| +; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v3| +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v14, v3 +; GFX10-NEXT: v_ldexp_f32 v12, v12, 1 +; GFX10-NEXT: v_ldexp_f32 v13, v11, 12 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v11, v7 +; GFX10-NEXT: v_readfirstlane_b32 s2, v14 +; GFX10-NEXT: v_div_scale_f32 v16, s4, v12, v12, 1.0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11 +; GFX10-NEXT: v_rcp_f32_e32 v17, v16 +; GFX10-NEXT: v_not_b32_e32 v15, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v15, v15, v14 +; GFX10-NEXT: v_div_scale_f32 v14, vcc_lo, 1.0, v12, 1.0 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 -; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7 -; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX10-NEXT: v_fma_f32 v18, -v16, v17, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_mul_f32_e32 v18, v14, v17 +; GFX10-NEXT: v_fma_f32 v19, -v16, v18, v14 +; GFX10-NEXT: v_fmac_f32_e32 v18, v19, v17 +; GFX10-NEXT: v_fma_f32 v14, -v16, v18, v14 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 -; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 -; GFX10-NEXT: v_trunc_f32_e32 v5, 
v5 -; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: v_div_fmas_f32 v14, v14, v17, v18 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v15 +; GFX10-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB12_31 +; GFX10-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 12 +; GFX10-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v16, v13 +; GFX10-NEXT: s_add_i32 s2, s2, -12 +; GFX10-NEXT: s_cmp_gt_i32 s2, 12 +; GFX10-NEXT: v_mul_f32_e32 v13, v16, v14 +; GFX10-NEXT: v_rndne_f32_e32 v13, v13 +; GFX10-NEXT: v_fma_f32 v13, -v13, v12, v16 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX10-NEXT: v_add_f32_e32 v15, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v13, v13, 12 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_29 +; GFX10-NEXT: ; %bb.30: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v15, s2 +; GFX10-NEXT: v_mov_b32_e32 v13, v16 +; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX10-NEXT: v_add_nc_u32_e32 v15, -11, v15 +; GFX10-NEXT: v_ldexp_f32 v13, v13, v15 +; GFX10-NEXT: v_mul_f32_e32 v14, v13, v14 +; GFX10-NEXT: v_rndne_f32_e32 v14, v14 +; GFX10-NEXT: v_fma_f32 v13, -v14, v12, v13 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX10-NEXT: v_add_f32_e32 v12, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v11, v12, v11 +; GFX10-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, v3 +; GFX10-NEXT: .LBB12_32: ; %Flow116 +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v0| +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v1| +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo 
+; GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v2| +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v3| +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v10, vcc_lo +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v7 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v11, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v4f32: @@ -4891,97 +14713,386 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX11-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[4:5] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 -; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v11, v10 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB12_2 +; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0 +; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB12_3 +; GFX11-NEXT: s_branch .LBB12_8 +; GFX11-NEXT: .LBB12_2: +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: .LBB12_3: ; %frem.compute +; GFX11-NEXT: 
v_frexp_mant_f32_e64 v9, |v4| +; GFX11-NEXT: v_frexp_mant_f32_e64 v8, |v0| +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v11, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v9, v9, 1 +; GFX11-NEXT: v_ldexp_f32 v10, v8, 12 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v8, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, v11 +; GFX11-NEXT: v_div_scale_f32 v13, null, v9, v9, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v8, -1, v8 +; GFX11-NEXT: v_rcp_f32_e32 v14, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v12, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v11 +; GFX11-NEXT: v_div_scale_f32 v11, vcc_lo, 1.0, v9, 1.0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX11-NEXT: v_fma_f32 v15, -v13, v14, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v12, v9, v11 -; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v9 +; GFX11-NEXT: v_fmac_f32_e32 v14, v15, v14 +; GFX11-NEXT: v_mul_f32_e32 v15, v11, v14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11 -; GFX11-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX11-NEXT: v_fma_f32 v16, -v13, v15, v11 +; GFX11-NEXT: v_fmac_f32_e32 v15, v16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v11, -v13, v15, v11 ; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v11, v11, v14, v15 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: 
v_div_fixup_f32 v11, v11, v9, 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB12_7 +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 12 +; GFX11-NEXT: .LBB12_5: ; %frem.loop_body +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v13, v10 +; GFX11-NEXT: s_add_i32 s2, s2, -12 +; GFX11-NEXT: s_cmp_gt_i32 s2, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v9, v9, v11, v12 -; GFX11-NEXT: v_div_fixup_f32 v9, v9, v7, v3 +; GFX11-NEXT: v_mul_f32_e32 v10, v13, v11 +; GFX11-NEXT: v_rndne_f32_e32 v10, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v9, v9 -; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 -; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v10, v9 +; GFX11-NEXT: v_fma_f32 v10, -v10, v9, v13 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX11-NEXT: v_add_f32_e32 v12, v10, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v10, v10, 12 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_5 +; GFX11-NEXT: ; %bb.6: ; %Flow125 +; GFX11-NEXT: v_mov_b32_e32 v12, s2 +; GFX11-NEXT: v_mov_b32_e32 v10, v13 +; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v12, -11, v12 +; GFX11-NEXT: v_ldexp_f32 v10, v10, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v11, 
v10, v11 +; GFX11-NEXT: v_rndne_f32_e32 v11, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v10, -v11, v9, v10 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v10 +; GFX11-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v8, v9, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v8, v0 +; GFX11-NEXT: .LBB12_8: +; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB12_10 +; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1 +; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB12_11 +; GFX11-NEXT: s_branch .LBB12_16 +; GFX11-NEXT: .LBB12_10: +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: .LBB12_11: ; %frem.compute15 +; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v5| +; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v1| +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v12, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v10, v10, 1 +; GFX11-NEXT: v_ldexp_f32 v11, v9, 12 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v9, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, v12 +; GFX11-NEXT: v_div_scale_f32 v14, null, v10, v10, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9 +; GFX11-NEXT: v_rcp_f32_e32 v15, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v13, v9 +; GFX11-NEXT: 
v_add_nc_u32_e32 v13, v13, v12 +; GFX11-NEXT: v_div_scale_f32 v12, vcc_lo, 1.0, v10, 1.0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX11-NEXT: v_fma_f32 v16, -v14, v15, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10 -; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX11-NEXT: v_fmac_f32_e32 v15, v16, v15 +; GFX11-NEXT: v_mul_f32_e32 v16, v12, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10 -; GFX11-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX11-NEXT: v_fma_f32 v17, -v14, v16, v12 +; GFX11-NEXT: v_fmac_f32_e32 v16, v17, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v12, -v14, v16, v12 ; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v12, v12, v15, v16 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB12_15 +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 12 +; GFX11-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v14, v11 +; GFX11-NEXT: s_add_i32 s2, s2, -12 +; GFX11-NEXT: s_cmp_gt_i32 s2, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11 -; GFX11-NEXT: v_div_fixup_f32 v7, v7, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v11, v14, v12 +; GFX11-NEXT: v_rndne_f32_e32 v11, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v7, v7 -; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 -; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v9, v7 +; GFX11-NEXT: v_fma_f32 v11, -v11, v10, v14 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: v_add_f32_e32 v13, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v11, v11, 12 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_13 +; GFX11-NEXT: ; %bb.14: ; %Flow121 +; GFX11-NEXT: v_mov_b32_e32 v13, s2 +; GFX11-NEXT: v_mov_b32_e32 v11, v14 +; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v13, -11, v13 +; GFX11-NEXT: v_ldexp_f32 v11, v11, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v12, v11, v12 +; GFX11-NEXT: v_rndne_f32_e32 v12, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v11, -v12, v10, v11 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v9, v10, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, v9, v1 +; GFX11-NEXT: .LBB12_16: +; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB12_18 +; GFX11-NEXT: ; %bb.17: ; %frem.else47 +; GFX11-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2 +; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6| +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB12_19 +; GFX11-NEXT: s_branch .LBB12_24 +; GFX11-NEXT: .LBB12_18: +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: .LBB12_19: ; %frem.compute46 +; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v6| +; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v2| +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v13, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v11, v11, 1 +; GFX11-NEXT: v_ldexp_f32 v12, v10, 12 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v10, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, v13 +; GFX11-NEXT: v_div_scale_f32 v15, null, v11, v11, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10 +; GFX11-NEXT: v_rcp_f32_e32 v16, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v14, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v14, v14, v13 +; GFX11-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v11, 1.0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v10, -v7, v9, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX11-NEXT: v_fma_f32 v17, -v15, v16, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v10, v6, v9 -; GFX11-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX11-NEXT: v_fmac_f32_e32 v16, v17, v16 +; GFX11-NEXT: v_mul_f32_e32 v17, v13, v16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v9 -; GFX11-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX11-NEXT: v_fma_f32 v18, -v15, v17, v13 +; GFX11-NEXT: v_fmac_f32_e32 v17, v18, v16 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v13, -v15, v17, v13 ; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v13, v13, v16, v17 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB12_23 +; GFX11-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 12 +; GFX11-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v15, v12 +; GFX11-NEXT: s_add_i32 s2, s2, -12 +; GFX11-NEXT: s_cmp_gt_i32 s2, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v6, v6, v9, v10 -; GFX11-NEXT: v_div_fixup_f32 v6, v6, v5, v1 +; GFX11-NEXT: v_mul_f32_e32 v12, v15, v13 +; GFX11-NEXT: v_rndne_f32_e32 v12, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v6, v6 -; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v7, v6 +; GFX11-NEXT: v_fma_f32 v12, -v12, v11, v15 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_add_f32_e32 v14, v12, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v12, v12, 12 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_21 +; GFX11-NEXT: ; %bb.22: ; %Flow117 +; GFX11-NEXT: v_mov_b32_e32 v14, s2 +; GFX11-NEXT: v_mov_b32_e32 v12, v15 +; GFX11-NEXT: .LBB12_23: ; 
%frem.loop_exit55 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v14, -11, v14 +; GFX11-NEXT: v_ldexp_f32 v12, v12, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v13, v12, v13 +; GFX11-NEXT: v_rndne_f32_e32 v13, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v12, -v13, v11, v12 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_add_f32_e32 v11, v12, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v10, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v10, 0x7fffffff, v10, v2 +; GFX11-NEXT: .LBB12_24: +; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB12_26 +; GFX11-NEXT: ; %bb.25: ; %frem.else78 +; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3 +; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB12_27 +; GFX11-NEXT: s_branch .LBB12_32 +; GFX11-NEXT: .LBB12_26: +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: .LBB12_27: ; %frem.compute77 +; GFX11-NEXT: v_frexp_mant_f32_e64 v12, |v7| +; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v3| +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v14, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v12, v12, 1 +; GFX11-NEXT: v_ldexp_f32 v13, v11, 12 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v11, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, v14 +; GFX11-NEXT: v_div_scale_f32 v16, null, v12, v12, 1.0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v11, -1, v11 +; GFX11-NEXT: v_rcp_f32_e32 v17, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v15, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v14 +; GFX11-NEXT: v_div_scale_f32 v14, vcc_lo, 1.0, v12, 1.0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v9, -v6, v7, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v7 +; GFX11-NEXT: v_fma_f32 v18, -v16, v17, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX11-NEXT: v_fma_f32 v10, -v6, v9, v5 +; GFX11-NEXT: v_fmac_f32_e32 v17, v18, v17 +; GFX11-NEXT: v_mul_f32_e32 v18, v14, v17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v7 -; GFX11-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX11-NEXT: v_fma_f32 v19, -v16, v18, v14 +; GFX11-NEXT: v_fmac_f32_e32 v18, v19, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v14, -v16, v18, v14 ; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: v_div_fmas_f32 v14, v14, v17, v18 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB12_31 +; GFX11-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 12 +; GFX11-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v16, v13 +; GFX11-NEXT: s_add_i32 s2, s2, -12 +; GFX11-NEXT: s_cmp_gt_i32 
s2, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v9 -; GFX11-NEXT: v_div_fixup_f32 v5, v5, v4, v0 +; GFX11-NEXT: v_mul_f32_e32 v13, v16, v14 +; GFX11-NEXT: v_rndne_f32_e32 v13, v13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: v_fma_f32 v13, -v13, v12, v16 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX11-NEXT: v_add_f32_e32 v15, v13, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v13, v13, 12 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_29 +; GFX11-NEXT: ; %bb.30: ; %Flow +; GFX11-NEXT: v_mov_b32_e32 v15, s2 +; GFX11-NEXT: v_mov_b32_e32 v13, v16 +; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v15, -11, v15 +; GFX11-NEXT: v_ldexp_f32 v13, v13, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v14, v13, v14 +; GFX11-NEXT: v_rndne_f32_e32 v14, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v13, -v14, v12, v13 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v13 +; GFX11-NEXT: v_add_f32_e32 v12, v13, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc_lo +; GFX11-NEXT: v_ldexp_f32 v11, v12, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, v3 +; GFX11-NEXT: .LBB12_32: ; %Flow116 +; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v4 +; GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v0| +; GFX11-NEXT: v_mov_b32_e32 
v4, 0 +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v1| +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo +; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v5 +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v2| +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo +; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v6 +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x7f800000, |v3| +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v10, vcc_lo +; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v7 +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v11, vcc_lo +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v4f32: @@ -4989,101 +15100,426 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v8, 0 +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 +; GFX1150-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 -; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v11, v10 +; GFX1150-NEXT: v_readfirstlane_b32 s10, v1 +; GFX1150-NEXT: v_readfirstlane_b32 s9, v2 +; GFX1150-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1150-NEXT: global_load_b128 v[1:4], v4, s[4:5] offset:64 +; GFX1150-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1150-NEXT: s_and_b32 s5, s8, 0x7fffffff +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: 
v_readfirstlane_b32 s6, v1 +; GFX1150-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1150-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1150-NEXT: s_and_b32 s12, s6, 0x7fffffff +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_cmp_ngt_f32 s5, s12 +; GFX1150-NEXT: s_cbranch_scc0 .LBB12_2 +; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: s_cmp_eq_f32 s5, s12 +; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 +; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB12_3 +; GFX1150-NEXT: s_branch .LBB12_8 +; GFX1150-NEXT: .LBB12_2: +; GFX1150-NEXT: ; implicit-def: $vgpr0 +; GFX1150-NEXT: .LBB12_3: ; %frem.compute +; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s6| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s8| +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1150-NEXT: v_ldexp_f32 v2, v0, 12 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1150-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_readfirstlane_b32 s12, v0 +; GFX1150-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1150-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v4, v0 +; GFX1150-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 ; GFX1150-NEXT: s_denorm_mode 15 -; GFX1150-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v7, -v5, 
v6, 1.0 +; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1150-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1150-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 +; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB12_7 +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: s_sub_i32 s11, s11, s12 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s11, s11, 12 +; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_mov_b32_e32 v5, v2 +; GFX1150-NEXT: s_add_i32 s11, s11, -12 +; GFX1150-NEXT: s_cmp_gt_i32 s11, 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1150-NEXT: v_rndne_f32_e32 v2, v2 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v11 -; GFX1150-NEXT: v_mul_f32_e32 v12, v9, v11 +; GFX1150-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1150-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1150-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v2, v2, 12 +; GFX1150-NEXT: s_cbranch_scc1 
.LBB12_5 +; GFX1150-NEXT: ; %bb.6: ; %Flow125 +; GFX1150-NEXT: v_mov_b32_e32 v4, s11 +; GFX1150-NEXT: v_mov_b32_e32 v2, v5 +; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v13, -v10, v12, v9 -; GFX1150-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX1150-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1150-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v2, v3, v1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v9, -v10, v12, v9 -; GFX1150-NEXT: s_denorm_mode 12 -; GFX1150-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1150-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1150-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f32 v9, v9, v7, v3 -; GFX1150-NEXT: v_trunc_f32_e32 v9, v9 +; GFX1150-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s8 +; GFX1150-NEXT: .LBB12_8: +; GFX1150-NEXT: s_and_b32 s8, s10, 0x7fffffff +; GFX1150-NEXT: s_and_b32 s12, s4, 0x7fffffff +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_cmp_ngt_f32 s8, s12 +; GFX1150-NEXT: s_cbranch_scc0 .LBB12_10 +; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: s_cmp_eq_f32 s8, s12 +; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10 +; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_cndmask_b32_e32 v1, s10, v1, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB12_11 +; 
GFX1150-NEXT: s_branch .LBB12_16 +; GFX1150-NEXT: .LBB12_10: +; GFX1150-NEXT: ; implicit-def: $vgpr1 +; GFX1150-NEXT: .LBB12_11: ; %frem.compute15 +; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s4| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s10| +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1150-NEXT: v_ldexp_f32 v3, v1, 12 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v1, s4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_readfirstlane_b32 s11, v4 +; GFX1150-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_readfirstlane_b32 s12, v1 +; GFX1150-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX1150-NEXT: v_fma_f32 v3, v9, v7, v3 -; GFX1150-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 -; GFX1150-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v10, v9 +; GFX1150-NEXT: v_not_b32_e32 v5, v1 +; GFX1150-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1150-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 ; GFX1150-NEXT: s_denorm_mode 15 -; GFX1150-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v7 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v10 -; GFX1150-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX1150-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1150-NEXT: v_fma_f32 v9, -v6, v8, v4 ; 
GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v12, -v9, v11, v7 -; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v10 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX1150-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1150-NEXT: v_fma_f32 v4, -v6, v8, v4 ; GFX1150-NEXT: s_denorm_mode 12 -; GFX1150-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 +; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB12_15 +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: s_sub_i32 s11, s11, s12 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s11, s11, 12 +; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_mov_b32_e32 v6, v3 +; GFX1150-NEXT: s_add_i32 s11, s11, -12 +; GFX1150-NEXT: s_cmp_gt_i32 s11, 12 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f32 v7, v7, v6, v2 -; GFX1150-NEXT: v_trunc_f32_e32 v7, v7 +; GFX1150-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1150-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX1150-NEXT: v_fma_f32 v2, v7, v6, v2 -; GFX1150-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 -; GFX1150-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v9, v7 +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-NEXT: 
v_fma_f32 v3, v3, v2, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1150-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v3, v3, 12 +; GFX1150-NEXT: s_cbranch_scc1 .LBB12_13 +; GFX1150-NEXT: ; %bb.14: ; %Flow121 +; GFX1150-NEXT: v_mov_b32_e32 v5, s11 +; GFX1150-NEXT: v_mov_b32_e32 v3, v6 +; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5 +; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1150-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1150-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1150-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s10 +; GFX1150-NEXT: .LBB12_16: +; GFX1150-NEXT: s_and_b32 s10, s9, 0x7fffffff +; GFX1150-NEXT: s_and_b32 s12, s3, 0x7fffffff +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_cmp_ngt_f32 s10, s12 +; GFX1150-NEXT: s_cbranch_scc0 .LBB12_18 +; GFX1150-NEXT: ; %bb.17: ; %frem.else47 +; GFX1150-NEXT: s_cmp_eq_f32 s10, s12 +; GFX1150-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9 +; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: 
v_cndmask_b32_e32 v2, s9, v2, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB12_19 +; GFX1150-NEXT: s_branch .LBB12_24 +; GFX1150-NEXT: .LBB12_18: +; GFX1150-NEXT: ; implicit-def: $vgpr2 +; GFX1150-NEXT: .LBB12_19: ; %frem.compute46 +; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s3| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s9| +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v5, s9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_ldexp_f32 v3, v3, 1 +; GFX1150-NEXT: v_ldexp_f32 v4, v2, 12 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v2, s3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_readfirstlane_b32 s11, v5 +; GFX1150-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_readfirstlane_b32 s12, v2 +; GFX1150-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1150-NEXT: v_rcp_f32_e32 v8, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v6, v2 +; GFX1150-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 ; GFX1150-NEXT: s_denorm_mode 15 -; GFX1150-NEXT: v_fma_f32 v10, -v7, v9, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1150-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1150-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1150-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v8, v9 
+; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 +; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB12_23 +; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX1150-NEXT: s_sub_i32 s11, s11, s12 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s11, s11, 12 +; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_mov_b32_e32 v7, v4 +; GFX1150-NEXT: s_add_i32 s11, s11, -12 +; GFX1150-NEXT: s_cmp_gt_i32 s11, 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1150-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1150-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1150-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v4, v4, 12 +; GFX1150-NEXT: s_cbranch_scc1 .LBB12_21 +; GFX1150-NEXT: ; %bb.22: ; %Flow117 +; GFX1150-NEXT: v_mov_b32_e32 v6, s11 +; GFX1150-NEXT: v_mov_b32_e32 v4, v7 +; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_nc_u32_e32 v6, -11, v6 +; GFX1150-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1150-NEXT: v_rndne_f32_e32 v5, v5 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 
+; GFX1150-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s9 +; GFX1150-NEXT: .LBB12_24: +; GFX1150-NEXT: s_and_b32 s9, s7, 0x7fffffff +; GFX1150-NEXT: s_and_b32 s12, s2, 0x7fffffff +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_cmp_ngt_f32 s9, s12 +; GFX1150-NEXT: s_cbranch_scc0 .LBB12_26 +; GFX1150-NEXT: ; %bb.25: ; %frem.else78 +; GFX1150-NEXT: s_cmp_eq_f32 s9, s12 +; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7 +; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_cndmask_b32_e32 v3, s7, v3, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB12_27 +; GFX1150-NEXT: s_branch .LBB12_32 +; GFX1150-NEXT: .LBB12_26: +; GFX1150-NEXT: ; implicit-def: $vgpr3 +; GFX1150-NEXT: .LBB12_27: ; %frem.compute77 +; GFX1150-NEXT: v_frexp_mant_f32_e64 v4, |s2| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s7| +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v6, s7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_ldexp_f32 v4, v4, 1 +; GFX1150-NEXT: v_ldexp_f32 v5, v3, 12 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_readfirstlane_b32 s11, v6 +; GFX1150-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_readfirstlane_b32 s12, v3 +; GFX1150-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1150-NEXT: v_rcp_f32_e32 v9, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v7, v3 +; GFX1150-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1150-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1150-NEXT: v_fma_f32 v11, -v8, v10, v6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v11, -v7, v10, v6 ; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v9 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX1150-NEXT: v_fma_f32 v6, -v8, v10, v6 ; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 +; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB12_31 +; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX1150-NEXT: s_sub_i32 s11, s11, s12 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s11, s11, 12 +; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_mov_b32_e32 v8, v5 +; GFX1150-NEXT: s_add_i32 s11, s11, -12 +; GFX1150-NEXT: s_cmp_gt_i32 s11, 12 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v5, v1 -; GFX1150-NEXT: v_trunc_f32_e32 v6, v6 +; GFX1150-NEXT: v_mul_f32_e32 v5, v8, v6 +; GFX1150-NEXT: v_rndne_f32_e32 v5, v5 ; 
GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX1150-NEXT: v_fma_f32 v1, v6, v5, v1 -; GFX1150-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 -; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 -; GFX1150-NEXT: s_denorm_mode 15 -; GFX1150-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-NEXT: v_fma_f32 v5, v5, v4, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1150-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1150-NEXT: s_cbranch_scc1 .LBB12_29 +; GFX1150-NEXT: ; %bb.30: ; %Flow +; GFX1150-NEXT: v_mov_b32_e32 v7, s11 +; GFX1150-NEXT: v_mov_b32_e32 v5, v8 +; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_nc_u32_e32 v7, -11, v7 +; GFX1150-NEXT: v_ldexp_f32 v5, v5, v7 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX1150-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX1150-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1150-NEXT: v_rndne_f32_e32 v6, v6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v10, -v6, v9, v5 -; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v7 +; GFX1150-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1150-NEXT: v_fmac_f32_e32 v5, v6, v4 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f32 v5, -v6, v9, v5 -; GFX1150-NEXT: s_denorm_mode 12 -; GFX1150-NEXT: 
v_div_fmas_f32 v5, v5, v7, v9 +; GFX1150-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1150-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v4, v0 -; GFX1150-NEXT: v_trunc_f32_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4 -; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX1150-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s7 +; GFX1150-NEXT: .LBB12_32: ; %Flow116 +; GFX1150-NEXT: s_cmp_lg_f32 s6, 0 +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1150-NEXT: s_cmp_nge_f32 s5, 0x7f800000 +; GFX1150-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_and_b32 vcc_lo, s5, s6 +; GFX1150-NEXT: s_cmp_lg_f32 s4, 0 +; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX1150-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1150-NEXT: s_cmp_nge_f32 s8, 0x7f800000 +; GFX1150-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_and_b32 vcc_lo, s5, s4 +; GFX1150-NEXT: s_cmp_lg_f32 s3, 0 +; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo +; GFX1150-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-NEXT: s_cmp_nge_f32 s10, 0x7f800000 +; GFX1150-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_and_b32 vcc_lo, s4, s3 +; GFX1150-NEXT: s_cmp_lg_f32 s2, 0 +; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v2, vcc_lo +; GFX1150-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-NEXT: s_cmp_nge_f32 s9, 0x7f800000 +; GFX1150-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1150-NEXT: v_cndmask_b32_e32 
v3, 0x7fc00000, v3, vcc_lo +; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1150-NEXT: s_endpgm ; ; GFX1200-LABEL: frem_v4f32: @@ -5091,104 +15527,448 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v8, 0 +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX1200-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 +; GFX1200-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 -; GFX1200-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1200-NEXT: v_rcp_f32_e32 v11, v10 +; GFX1200-NEXT: v_readfirstlane_b32 s10, v1 +; GFX1200-NEXT: v_readfirstlane_b32 s9, v2 +; GFX1200-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1200-NEXT: global_load_b128 v[1:4], v4, s[4:5] offset:64 +; GFX1200-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1200-NEXT: s_and_b32 s5, s8, 0x7fffffff +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_readfirstlane_b32 s6, v1 +; GFX1200-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1200-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1200-NEXT: s_and_b32 s12, s6, 0x7fffffff +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_cmp_ngt_f32 s5, s12 +; GFX1200-NEXT: s_cbranch_scc0 .LBB12_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: s_cmp_eq_f32 s5, s12 +; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 +; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB12_3 +; GFX1200-NEXT: s_branch .LBB12_8 +; GFX1200-NEXT: 
.LBB12_2: +; GFX1200-NEXT: ; implicit-def: $vgpr0 +; GFX1200-NEXT: .LBB12_3: ; %frem.compute +; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s6| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s8| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1200-NEXT: v_ldexp_f32 v2, v0, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1200-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s12, v0 +; GFX1200-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX1200-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v4, v0 +; GFX1200-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 -; GFX1200-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v6 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v11 -; GFX1200-NEXT: v_mul_f32_e32 v12, v9, v11 +; GFX1200-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1200-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v13, -v10, v12, v9 -; GFX1200-NEXT: v_fmac_f32_e32 v12, v13, v11 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1200-NEXT: v_fma_f32 v3, -v5, 
v7, v3 ; GFX1200-NEXT: s_denorm_mode 12 -; GFX1200-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 +; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 +; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v2 +; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 +; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f32 v9, v9, v7, v3 -; GFX1200-NEXT: v_trunc_f32_e32 v9, v9 +; GFX1200-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1200-NEXT: v_rndne_f32_e32 v2, v2 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX1200-NEXT: v_fma_f32 v3, v9, v7, v3 -; GFX1200-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 -; GFX1200-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1200-NEXT: v_rcp_f32_e32 v10, v9 -; GFX1200-NEXT: s_denorm_mode 15 -; GFX1200-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX1200-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1200-NEXT: v_fma_f32 v2, v2, v1, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: 
v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v2, v2, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow125 +; GFX1200-NEXT: v_mov_b32_e32 v4, s11 +; GFX1200-NEXT: v_mov_b32_e32 v2, v5 +; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v10 -; GFX1200-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX1200-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v12, -v9, v11, v7 -; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v2, v3, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 +; GFX1200-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s8 +; GFX1200-NEXT: .LBB12_8: +; GFX1200-NEXT: s_and_b32 s8, s10, 0x7fffffff +; GFX1200-NEXT: s_and_b32 s12, s4, 0x7fffffff +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_ngt_f32 s8, s12 +; GFX1200-NEXT: s_cbranch_scc0 .LBB12_10 +; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: s_cmp_eq_f32 s8, s12 +; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10 +; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v1, s10, v1, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB12_11 +; GFX1200-NEXT: s_branch .LBB12_16 +; GFX1200-NEXT: .LBB12_10: +; GFX1200-NEXT: ; implicit-def: $vgpr1 +; GFX1200-NEXT: .LBB12_11: ; %frem.compute15 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s4| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s10| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_ldexp_f32 v2, v2, 1 +; GFX1200-NEXT: v_ldexp_f32 v3, v1, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v1, s4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s11, v4 +; GFX1200-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s12, v1 +; GFX1200-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v5, v1 +; GFX1200-NEXT: v_add_nc_u32_e32 v5, v5, v4 +; GFX1200-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v8, v4, v7 +; GFX1200-NEXT: v_fma_f32 v9, -v6, v8, v4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1200-NEXT: v_fma_f32 v4, -v6, v8, v4 ; GFX1200-NEXT: s_denorm_mode 12 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 +; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_15 +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 +; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v6, v3 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 +; GFX1200-NEXT: v_mul_f32_e32 v3, v6, v4 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f32 v7, v7, v6, v2 -; GFX1200-NEXT: v_trunc_f32_e32 v7, v7 +; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX1200-NEXT: v_fma_f32 v2, v7, v6, v2 -; GFX1200-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 -; GFX1200-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1200-NEXT: v_rcp_f32_e32 v9, v7 +; GFX1200-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1200-NEXT: v_ldexp_f32 v3, v3, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_13 +; GFX1200-NEXT: ; %bb.14: ; %Flow121 +; GFX1200-NEXT: v_mov_b32_e32 v5, s11 +; GFX1200-NEXT: v_mov_b32_e32 v3, v6 +; GFX1200-NEXT: 
.LBB12_15: ; %frem.loop_exit24 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 +; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX1200-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1200-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s10 +; GFX1200-NEXT: .LBB12_16: +; GFX1200-NEXT: s_and_b32 s10, s9, 0x7fffffff +; GFX1200-NEXT: s_and_b32 s12, s3, 0x7fffffff +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_ngt_f32 s10, s12 +; GFX1200-NEXT: s_cbranch_scc0 .LBB12_18 +; GFX1200-NEXT: ; %bb.17: ; %frem.else47 +; GFX1200-NEXT: s_cmp_eq_f32 s10, s12 +; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9 +; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v2, s9, v2, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB12_19 +; GFX1200-NEXT: s_branch .LBB12_24 +; GFX1200-NEXT: .LBB12_18: +; GFX1200-NEXT: ; implicit-def: $vgpr2 +; GFX1200-NEXT: .LBB12_19: ; %frem.compute46 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s3| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s9| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v5, s9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: 
v_ldexp_f32 v3, v3, 1 +; GFX1200-NEXT: v_ldexp_f32 v4, v2, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v2, s3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s11, v5 +; GFX1200-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s12, v2 +; GFX1200-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1200-NEXT: v_rcp_f32_e32 v8, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v6, v2 +; GFX1200-NEXT: v_add_nc_u32_e32 v6, v6, v5 +; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 -; GFX1200-NEXT: v_fma_f32 v10, -v7, v9, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v9, v5, v8 +; GFX1200-NEXT: v_fma_f32 v10, -v7, v9, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v8 +; GFX1200-NEXT: v_fma_f32 v5, -v7, v9, v5 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v8, v9 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 +; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_23 +; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 +; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 +; GFX1200-NEXT: v_mul_f32_e32 v4, v7, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX1200-NEXT: v_ldexp_f32 v4, v4, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_21 +; GFX1200-NEXT: ; %bb.22: ; %Flow117 +; GFX1200-NEXT: v_mov_b32_e32 v6, s11 +; GFX1200-NEXT: v_mov_b32_e32 v4, v7 +; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v6, -11, v6 +; GFX1200-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX1200-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1200-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 +; GFX1200-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v2, 
v3, v2 +; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s9 +; GFX1200-NEXT: .LBB12_24: +; GFX1200-NEXT: s_and_b32 s9, s7, 0x7fffffff +; GFX1200-NEXT: s_and_b32 s12, s2, 0x7fffffff +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_ngt_f32 s9, s12 +; GFX1200-NEXT: s_cbranch_scc0 .LBB12_26 +; GFX1200-NEXT: ; %bb.25: ; %frem.else78 +; GFX1200-NEXT: s_cmp_eq_f32 s9, s12 +; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7 +; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v3, s7, v3, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB12_27 +; GFX1200-NEXT: s_branch .LBB12_32 +; GFX1200-NEXT: .LBB12_26: +; GFX1200-NEXT: ; implicit-def: $vgpr3 +; GFX1200-NEXT: .LBB12_27: ; %frem.compute77 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s2| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s7| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, s7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_ldexp_f32 v4, v4, 1 +; GFX1200-NEXT: v_ldexp_f32 v5, v3, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s11, v6 +; GFX1200-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s12, v3 +; GFX1200-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1200-NEXT: v_rcp_f32_e32 v9, v8 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v7, v3 +; GFX1200-NEXT: v_add_nc_u32_e32 v7, v7, v6 +; GFX1200-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, 
v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1200-NEXT: v_fma_f32 v11, -v8, v10, v6 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v11, -v7, v10, v6 ; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v9 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX1200-NEXT: v_fma_f32 v6, -v8, v10, v6 ; GFX1200-NEXT: s_denorm_mode 12 ; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 +; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_31 +; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 +; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v8, v5 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 +; GFX1200-NEXT: v_mul_f32_e32 v5, v8, v6 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v5, v1 -; GFX1200-NEXT: v_trunc_f32_e32 v6, v6 +; GFX1200-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX1200-NEXT: v_fma_f32 v1, v6, v5, v1 -; GFX1200-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 -; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 -; 
GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 -; GFX1200-NEXT: s_denorm_mode 15 -; GFX1200-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; GFX1200-NEXT: v_fma_f32 v5, v5, v4, v8 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-NEXT: v_add_f32_e32 v7, v5, v4 +; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX1200-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX1200-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1200-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_29 +; GFX1200-NEXT: ; %bb.30: ; %Flow +; GFX1200-NEXT: v_mov_b32_e32 v7, s11 +; GFX1200-NEXT: v_mov_b32_e32 v5, v8 +; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v7, -11, v7 +; GFX1200-NEXT: v_ldexp_f32 v5, v5, v7 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v10, -v6, v9, v5 -; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v7 +; GFX1200-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX1200-NEXT: v_rndne_f32_e32 v6, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1200-NEXT: v_fmac_f32_e32 v5, v6, v4 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v5, -v6, v9, v5 -; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1200-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v9 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v4, v0 -; GFX1200-NEXT: v_trunc_f32_e32 v5, v5 +; GFX1200-NEXT: 
v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX1200-NEXT: v_fmac_f32_e32 v0, v5, v4 -; GFX1200-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX1200-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s7 +; GFX1200-NEXT: .LBB12_32: ; %Flow116 +; GFX1200-NEXT: s_cmp_lg_f32 s6, 0 +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1200-NEXT: s_cmp_nge_f32 s5, 0x7f800000 +; GFX1200-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, s5, s6 +; GFX1200-NEXT: s_cmp_lg_f32 s4, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX1200-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1200-NEXT: s_cmp_nge_f32 s8, 0x7f800000 +; GFX1200-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, s5, s4 +; GFX1200-NEXT: s_cmp_lg_f32 s3, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo +; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-NEXT: s_cmp_nge_f32 s10, 0x7f800000 +; GFX1200-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, s4, s3 +; GFX1200-NEXT: s_cmp_lg_f32 s2, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v2, vcc_lo +; GFX1200-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-NEXT: s_cmp_nge_f32 s9, 0x7f800000 +; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v3, vcc_lo +; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 @@ -5202,131 +15982,431 @@ define 
amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] -; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3] -; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] -; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13] -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13 +; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; SI-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB13_2 +; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc +; SI-NEXT: v_cndmask_b32_e64 v8, v0, 
0, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB13_3 +; SI-NEXT: s_branch .LBB13_8 +; SI-NEXT: .LBB13_2: +; SI-NEXT: ; implicit-def: $vgpr8_vgpr9 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB13_3: ; %frem.compute +; SI-NEXT: s_brev_b32 s5, -2 +; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v1 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 0x7ff00000 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[0:1] +; SI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v10, v[0:1] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s2, v10 +; SI-NEXT: s_cselect_b32 s3, s2, 0 +; SI-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; SI-NEXT: v_and_b32_e32 v12, 0x7fffffff, v5 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[4:5]|, s[0:1] +; SI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; SI-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v8, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v12, v[4:5] +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v12 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_add_i32 s4, s7, -1 +; SI-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 +; SI-NEXT: s_not_b32 s0, s4 +; SI-NEXT: s_add_i32 s6, s0, s3 +; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[8:9], v[8:9], 1.0 +; SI-NEXT: v_rcp_f64_e32 v[14:15], v[12:13] +; SI-NEXT: v_fma_f64 v[16:17], -v[12:13], v[14:15], 1.0 +; SI-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; SI-NEXT: v_fma_f64 v[16:17], -v[12:13], v[14:15], 1.0 +; SI-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; SI-NEXT: v_div_scale_f64 v[16:17], s[0:1], 1.0, v[8:9], 1.0 +; SI-NEXT: v_mul_f64 v[18:19], v[16:17], v[14:15] +; SI-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], v[16:17] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v13 +; SI-NEXT: s_mov_b32 s0, 0x3ff00000 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v17 ; 
SI-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-NEXT: s_nop 1 -; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] -; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] -; SI-NEXT: v_readfirstlane_b32 s0, v8 -; SI-NEXT: v_readfirstlane_b32 s1, v9 -; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014 -; SI-NEXT: s_add_i32 s10, s2, 0xfffffc01 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_lshr_b64 s[8:9], s[2:3], s10 -; SI-NEXT: s_andn2_b64 s[8:9], s[0:1], s[8:9] -; SI-NEXT: s_and_b32 s11, s1, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 -; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cselect_b32 s9, s11, s9 -; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_cselect_b32 s1, s1, s9 -; SI-NEXT: s_cselect_b32 s0, s0, s8 -; SI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[6:7], v[2:3] -; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] -; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] -; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1] -; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v11 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_div_fmas_f64 v[12:13], v[20:21], v[14:15], v[18:19] +; SI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; SI-NEXT: s_cmp_lt_i32 s6, 27 +; SI-NEXT: s_cbranch_scc1 .LBB13_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s0, s3, s7 +; SI-NEXT: s_add_i32 s6, s0, 26 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: v_mov_b32_e32 v18, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v14, 0 +; SI-NEXT: .LBB13_5: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v17, v11 +; SI-NEXT: v_mov_b32_e32 v16, v10 +; SI-NEXT: 
v_mul_f64 v[10:11], v[16:17], v[12:13] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[10:11]|, s[2:3] +; SI-NEXT: v_bfi_b32 v15, s5, v18, v11 +; SI-NEXT: v_add_f64 v[19:20], v[10:11], v[14:15] +; SI-NEXT: v_add_f64 v[19:20], v[19:20], -v[14:15] +; SI-NEXT: v_cndmask_b32_e32 v11, v20, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc +; SI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[16:17] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_add_f64 v[19:20], v[10:11], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc +; SI-NEXT: v_cndmask_b32_e32 v10, v10, v19, vcc +; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; SI-NEXT: s_sub_i32 s6, s6, 26 +; SI-NEXT: s_cmp_gt_i32 s6, 26 +; SI-NEXT: s_cbranch_scc1 .LBB13_5 +; SI-NEXT: ; %bb.6: ; %Flow51 +; SI-NEXT: v_mov_b32_e32 v10, v16 +; SI-NEXT: v_mov_b32_e32 v11, v17 +; SI-NEXT: .LBB13_7: ; %frem.loop_exit +; SI-NEXT: s_sub_i32 s0, s6, 25 +; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], s0 +; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] +; SI-NEXT: s_mov_b32 s0, -1 +; SI-NEXT: s_mov_b32 s1, 0x432fffff +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[12:13]|, s[0:1] +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_mov_b32_e32 v14, 0x43300000 +; SI-NEXT: v_bfi_b32 v15, s0, v14, v13 +; SI-NEXT: v_mov_b32_e32 v14, 0 +; SI-NEXT: v_add_f64 v[16:17], v[12:13], v[14:15] +; SI-NEXT: v_add_f64 v[14:15], v[16:17], -v[14:15] +; SI-NEXT: v_cndmask_b32_e32 v13, v15, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc +; SI-NEXT: v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc +; SI-NEXT: v_ldexp_f64 v[8:9], v[8:9], s4 +; SI-NEXT: v_bfi_b32 v9, s0, v9, v1 +; SI-NEXT: .LBB13_8: +; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]| +; SI-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB13_10 +; SI-NEXT: ; %bb.9: ; %frem.else16 +; 
SI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| +; SI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc +; SI-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB13_11 +; SI-NEXT: s_branch .LBB13_16 +; SI-NEXT: .LBB13_10: +; SI-NEXT: ; implicit-def: $vgpr10_vgpr11 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB13_11: ; %frem.compute15 +; SI-NEXT: s_brev_b32 s5, -2 +; SI-NEXT: v_and_b32_e32 v12, 0x7fffffff, v3 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 0x7ff00000 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[0:1] +; SI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v12, v[2:3] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s2, v12 +; SI-NEXT: s_cselect_b32 s3, s2, 0 +; SI-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; SI-NEXT: v_and_b32_e32 v14, 0x7fffffff, v7 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[6:7]|, s[0:1] +; SI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; SI-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v10, v6, v10, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[6:7] +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s0, v14 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_add_i32 s4, s7, -1 +; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; SI-NEXT: s_not_b32 s0, s4 +; SI-NEXT: s_add_i32 s6, s0, s3 +; SI-NEXT: v_div_scale_f64 v[14:15], s[0:1], v[10:11], v[10:11], 1.0 +; SI-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; SI-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], 1.0 +; SI-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] +; SI-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], 1.0 +; SI-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] +; SI-NEXT: v_div_scale_f64 v[18:19], s[0:1], 1.0, v[10:11], 1.0 +; SI-NEXT: v_mul_f64 
v[20:21], v[18:19], v[16:17] +; SI-NEXT: v_fma_f64 v[22:23], -v[14:15], v[20:21], v[18:19] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v11, v15 +; SI-NEXT: s_mov_b32 s0, 0x3ff00000 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v19 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-NEXT: s_nop 1 -; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] -; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v6 -; SI-NEXT: v_readfirstlane_b32 s1, v7 -; SI-NEXT: s_bfe_u32 s8, s1, 0xb0014 -; SI-NEXT: s_addk_i32 s8, 0xfc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 -; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_and_b32 s9, s1, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, s9, s3 -; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s1, s1, s3 -; SI-NEXT: s_cselect_b32 s0, s0, s2 -; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[4:5], v[0:1] -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_div_fmas_f64 v[14:15], v[22:23], v[16:17], v[20:21] +; SI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; SI-NEXT: s_cmp_lt_i32 s6, 27 +; SI-NEXT: s_cbranch_scc1 .LBB13_15 +; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: s_sub_i32 s0, s3, s7 +; SI-NEXT: s_add_i32 s6, s0, 26 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: v_mov_b32_e32 v20, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v16, 0 +; SI-NEXT: .LBB13_13: ; %frem.loop_body23 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v19, v13 +; SI-NEXT: v_mov_b32_e32 v18, v12 +; SI-NEXT: v_mul_f64 v[12:13], v[18:19], v[14:15] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[12:13]|, s[2:3] +; SI-NEXT: v_bfi_b32 v17, s5, v20, v13 +; SI-NEXT: v_add_f64 v[21:22], v[12:13], v[16:17] +; SI-NEXT: v_add_f64 v[21:22], v[21:22], -v[16:17] +; SI-NEXT: v_cndmask_b32_e32 v13, v22, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v21, v12, vcc +; SI-NEXT: v_fma_f64 v[12:13], -v[12:13], 
v[10:11], v[18:19] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; SI-NEXT: v_add_f64 v[21:22], v[12:13], v[10:11] +; SI-NEXT: v_cndmask_b32_e32 v13, v13, v22, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v12, v21, vcc +; SI-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; SI-NEXT: s_sub_i32 s6, s6, 26 +; SI-NEXT: s_cmp_gt_i32 s6, 26 +; SI-NEXT: s_cbranch_scc1 .LBB13_13 +; SI-NEXT: ; %bb.14: ; %Flow +; SI-NEXT: v_mov_b32_e32 v12, v18 +; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; SI-NEXT: s_sub_i32 s0, s6, 25 +; SI-NEXT: v_ldexp_f64 v[12:13], v[12:13], s0 +; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] +; SI-NEXT: s_mov_b32 s0, -1 +; SI-NEXT: s_mov_b32 s1, 0x432fffff +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[14:15]|, s[0:1] +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_mov_b32_e32 v16, 0x43300000 +; SI-NEXT: v_bfi_b32 v17, s0, v16, v15 +; SI-NEXT: v_mov_b32_e32 v16, 0 +; SI-NEXT: v_add_f64 v[18:19], v[14:15], v[16:17] +; SI-NEXT: v_add_f64 v[16:17], v[18:19], -v[16:17] +; SI-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; SI-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; SI-NEXT: v_add_f64 v[10:11], v[12:13], v[10:11] +; SI-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc +; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], s4 +; SI-NEXT: v_bfi_b32 v11, s0, v11, v3 +; SI-NEXT: .LBB13_16: ; %Flow50 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[4:5] +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 0x7ff00000 +; SI-NEXT: v_cmp_nge_f64_e64 s[2:3], |v[0:1]|, s[0:1] +; SI-NEXT: s_and_b64 vcc, s[2:3], vcc +; SI-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_nge_f64_e64 s[0:1], |v[2:3]|, s[0:1] +; SI-NEXT: s_and_b64 
vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s6, s2 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s6, s10 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s8 -; CI-NEXT: s_mov_b32 s1, s9 -; CI-NEXT: s_mov_b32 s8, s10 -; CI-NEXT: s_mov_b32 s9, s11 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] -; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] -; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] -; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB13_2 +; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| +; CI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; CI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc +; CI-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc +; CI-NEXT: s_cbranch_execz .LBB13_3 +; CI-NEXT: s_branch .LBB13_8 +; CI-NEXT: .LBB13_2: +; 
CI-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CI-NEXT: .LBB13_3: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; CI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] +; CI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] +; CI-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; CI-NEXT: v_add_i32_e32 v16, vcc, -1, v15 +; CI-NEXT: v_not_b32_e32 v12, v16 +; CI-NEXT: v_add_i32_e32 v17, vcc, v12, v14 +; CI-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 +; CI-NEXT: v_div_scale_f64 v[12:13], s[2:3], v[8:9], v[8:9], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] +; CI-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; CI-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; CI-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; CI-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; CI-NEXT: v_div_scale_f64 v[20:21], vcc, 1.0, v[8:9], 1.0 +; CI-NEXT: v_mul_f64 v[22:23], v[20:21], v[18:19] +; CI-NEXT: v_fma_f64 v[12:13], -v[12:13], v[22:23], v[20:21] ; CI-NEXT: s_nop 1 -; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] -; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] -; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1] -; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] -; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] -; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; CI-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[18:19], v[22:23] +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 +; CI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB13_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; 
CI-NEXT: v_sub_i32_e32 v14, vcc, v14, v15 +; CI-NEXT: v_add_i32_e32 v17, vcc, 26, v14 +; CI-NEXT: .LBB13_5: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v15, v11 +; CI-NEXT: v_mov_b32_e32 v14, v10 +; CI-NEXT: v_mul_f64 v[10:11], v[14:15], v[12:13] +; CI-NEXT: v_rndne_f64_e32 v[10:11], v[10:11] +; CI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; CI-NEXT: v_add_f64 v[18:19], v[10:11], v[8:9] +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v18, vcc +; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; CI-NEXT: v_subrev_i32_e32 v17, vcc, 26, v17 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v17 +; CI-NEXT: s_cbranch_vccnz .LBB13_5 +; CI-NEXT: ; %bb.6: ; %Flow51 +; CI-NEXT: v_mov_b32_e32 v10, v14 +; CI-NEXT: v_mov_b32_e32 v11, v15 +; CI-NEXT: .LBB13_7: ; %frem.loop_exit +; CI-NEXT: v_subrev_i32_e32 v14, vcc, 25, v17 +; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] +; CI-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; CI-NEXT: v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; CI-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; CI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; CI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc +; CI-NEXT: v_ldexp_f64 v[8:9], v[8:9], v16 +; CI-NEXT: v_bfi_b32 v9, s2, v9, v1 +; CI-NEXT: .LBB13_8: +; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB13_10 +; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| +; CI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; CI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc +; CI-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc +; CI-NEXT: s_cbranch_execz .LBB13_11 +; CI-NEXT: s_branch .LBB13_16 +; CI-NEXT: .LBB13_10: +; CI-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CI-NEXT: 
.LBB13_11: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; CI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] +; CI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] +; CI-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; CI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; CI-NEXT: v_add_i32_e32 v18, vcc, -1, v17 +; CI-NEXT: v_not_b32_e32 v14, v18 +; CI-NEXT: v_add_i32_e32 v19, vcc, v14, v16 +; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; CI-NEXT: v_div_scale_f64 v[14:15], s[2:3], v[10:11], v[10:11], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[20:21], v[14:15] +; CI-NEXT: v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0 +; CI-NEXT: v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21] +; CI-NEXT: v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0 +; CI-NEXT: v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21] +; CI-NEXT: v_div_scale_f64 v[22:23], vcc, 1.0, v[10:11], 1.0 +; CI-NEXT: v_mul_f64 v[24:25], v[22:23], v[20:21] +; CI-NEXT: v_fma_f64 v[14:15], -v[14:15], v[24:25], v[22:23] ; CI-NEXT: s_nop 1 -; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] -; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] +; CI-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[20:21], v[24:25] +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 +; CI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB13_15 +; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 +; CI-NEXT: v_add_i32_e32 v19, vcc, 26, v16 +; CI-NEXT: .LBB13_13: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v17, v13 +; CI-NEXT: v_mov_b32_e32 v16, v12 +; CI-NEXT: v_mul_f64 v[12:13], v[16:17], v[14:15] +; CI-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; CI-NEXT: v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; CI-NEXT: v_add_f64 v[20:21], v[12:13], v[10:11] +; CI-NEXT: 
v_cndmask_b32_e32 v13, v13, v21, vcc +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v20, vcc +; CI-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; CI-NEXT: v_subrev_i32_e32 v19, vcc, 26, v19 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v19 +; CI-NEXT: s_cbranch_vccnz .LBB13_13 +; CI-NEXT: ; %bb.14: ; %Flow +; CI-NEXT: v_mov_b32_e32 v12, v16 +; CI-NEXT: v_mov_b32_e32 v13, v17 +; CI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; CI-NEXT: v_subrev_i32_e32 v16, vcc, 25, v19 +; CI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] +; CI-NEXT: v_rndne_f64_e32 v[14:15], v[14:15] +; CI-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; CI-NEXT: v_add_f64 v[10:11], v[12:13], v[10:11] +; CI-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc +; CI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc +; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v18 +; CI-NEXT: v_bfi_b32 v11, s2, v11, v3 +; CI-NEXT: .LBB13_16: ; %Flow50 +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: s_mov_b32 s5, 0x7ff00000 +; CI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_cmp_nge_f64_e64 s[2:3], |v[0:1]|, s[4:5] +; CI-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; CI-NEXT: v_cmp_nge_f64_e64 s[4:5], |v[2:3]|, s[4:5] +; CI-NEXT: s_and_b64 vcc, s[2:3], vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc +; CI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_and_b64 vcc, s[4:5], vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v11, vcc +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; @@ -5335,86 +16415,341 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, s0 -; VI-NEXT: s_add_u32 s0, 
s4, 64 -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_add_u32 s2, s4, 64 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_addc_u32 s3, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3] -; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] -; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 -; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] -; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 -; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] -; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] -; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] -; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB13_2 +; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| +; VI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc +; VI-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc +; VI-NEXT: s_cbranch_execz .LBB13_3 +; VI-NEXT: s_branch .LBB13_8 +; VI-NEXT: .LBB13_2: +; VI-NEXT: ; implicit-def: $vgpr8_vgpr9 +; VI-NEXT: .LBB13_3: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; VI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] +; VI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] +; VI-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; VI-NEXT: v_add_u32_e32 v16, vcc, -1, v15 +; VI-NEXT: v_not_b32_e32 v12, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, v12, v14 +; VI-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 +; VI-NEXT: v_div_scale_f64 v[12:13], s[2:3], v[8:9], v[8:9], 1.0 +; VI-NEXT: 
v_rcp_f64_e32 v[18:19], v[12:13] +; VI-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; VI-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; VI-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; VI-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; VI-NEXT: v_div_scale_f64 v[20:21], vcc, 1.0, v[8:9], 1.0 +; VI-NEXT: v_mul_f64 v[22:23], v[20:21], v[18:19] +; VI-NEXT: v_fma_f64 v[12:13], -v[12:13], v[22:23], v[20:21] ; VI-NEXT: s_nop 1 -; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] -; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3] -; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11] -; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3] -; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] -; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 -; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 -; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] -; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] -; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13] +; VI-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[18:19], v[22:23] +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 +; VI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB13_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: v_sub_u32_e32 v14, vcc, v14, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 26, v14 +; VI-NEXT: .LBB13_5: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v15, v11 +; VI-NEXT: v_mov_b32_e32 v14, v10 +; VI-NEXT: v_mul_f64 v[10:11], v[14:15], v[12:13] +; VI-NEXT: v_rndne_f64_e32 v[10:11], v[10:11] +; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; VI-NEXT: v_add_f64 v[18:19], v[10:11], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v11, 
v11, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v18, vcc +; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; VI-NEXT: v_subrev_u32_e32 v17, vcc, 26, v17 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v17 +; VI-NEXT: s_cbranch_vccnz .LBB13_5 +; VI-NEXT: ; %bb.6: ; %Flow51 +; VI-NEXT: v_mov_b32_e32 v10, v14 +; VI-NEXT: v_mov_b32_e32 v11, v15 +; VI-NEXT: .LBB13_7: ; %frem.loop_exit +; VI-NEXT: v_subrev_u32_e32 v14, vcc, 25, v17 +; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] +; VI-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; VI-NEXT: v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; VI-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc +; VI-NEXT: v_ldexp_f64 v[8:9], v[8:9], v16 +; VI-NEXT: v_bfi_b32 v9, s2, v9, v1 +; VI-NEXT: .LBB13_8: +; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB13_10 +; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| +; VI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc +; VI-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc +; VI-NEXT: s_cbranch_execz .LBB13_11 +; VI-NEXT: s_branch .LBB13_16 +; VI-NEXT: .LBB13_10: +; VI-NEXT: ; implicit-def: $vgpr10_vgpr11 +; VI-NEXT: .LBB13_11: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; VI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] +; VI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] +; VI-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; VI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; VI-NEXT: v_add_u32_e32 v18, vcc, -1, v17 +; VI-NEXT: v_not_b32_e32 v14, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, v14, v16 +; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; VI-NEXT: v_div_scale_f64 v[14:15], s[2:3], v[10:11], v[10:11], 1.0 +; VI-NEXT: 
v_rcp_f64_e32 v[20:21], v[14:15] +; VI-NEXT: v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0 +; VI-NEXT: v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21] +; VI-NEXT: v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0 +; VI-NEXT: v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21] +; VI-NEXT: v_div_scale_f64 v[22:23], vcc, 1.0, v[10:11], 1.0 +; VI-NEXT: v_mul_f64 v[24:25], v[22:23], v[20:21] +; VI-NEXT: v_fma_f64 v[14:15], -v[14:15], v[24:25], v[22:23] ; VI-NEXT: s_nop 1 -; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15] -; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] -; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[20:21], v[24:25] +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 +; VI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; VI-NEXT: s_cbranch_vccnz .LBB13_15 +; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: v_sub_u32_e32 v16, vcc, v16, v17 +; VI-NEXT: v_add_u32_e32 v19, vcc, 26, v16 +; VI-NEXT: .LBB13_13: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v16, v12 +; VI-NEXT: v_mul_f64 v[12:13], v[16:17], v[14:15] +; VI-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; VI-NEXT: v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; VI-NEXT: v_add_f64 v[20:21], v[12:13], v[10:11] +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v21, vcc +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v20, vcc +; VI-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; VI-NEXT: v_subrev_u32_e32 v19, vcc, 26, v19 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v19 +; VI-NEXT: s_cbranch_vccnz .LBB13_13 +; VI-NEXT: ; %bb.14: ; %Flow +; VI-NEXT: v_mov_b32_e32 v12, v16 +; VI-NEXT: v_mov_b32_e32 v13, v17 +; VI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; VI-NEXT: v_subrev_u32_e32 v16, vcc, 25, v19 +; VI-NEXT: v_ldexp_f64 v[12:13], v[12:13], 
v16 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] +; VI-NEXT: v_rndne_f64_e32 v[14:15], v[14:15] +; VI-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; VI-NEXT: v_add_f64 v[10:11], v[12:13], v[10:11] +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc +; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v18 +; VI-NEXT: v_bfi_b32 v11, s2, v11, v3 +; VI-NEXT: .LBB13_16: ; %Flow50 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s3, 0x7ff00000 +; VI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_cmp_nge_f64_e64 s[4:5], |v[0:1]|, s[2:3] +; VI-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_cmp_nge_f64_e64 s[0:1], |v[2:3]|, s[2:3] +; VI-NEXT: s_and_b64 vcc, s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc +; VI-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: s_and_b64 vcc, s[0:1], vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: frem_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[2:3], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], 
v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB13_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| +; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_branch .LBB13_8 +; GFX9-NEXT: .LBB13_2: +; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX9-NEXT: .LBB13_3: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] +; GFX9-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; GFX9-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; GFX9-NEXT: v_add_u32_e32 v16, -1, v15 +; GFX9-NEXT: v_not_b32_e32 v12, v16 +; GFX9-NEXT: v_add_u32_e32 v17, v12, v14 +; GFX9-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 +; GFX9-NEXT: v_div_scale_f64 v[12:13], s[2:3], v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; GFX9-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; GFX9-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 +; GFX9-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[20:21], vcc, 1.0, v[8:9], 1.0 +; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[18:19] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[12:13], v[22:23], v[20:21] ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] -; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] -; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], 
v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[2:3], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[18:19], v[22:23] +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 +; GFX9-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB13_7 +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: v_sub_u32_e32 v14, v14, v15 +; GFX9-NEXT: v_add_u32_e32 v17, 26, v14 +; GFX9-NEXT: .LBB13_5: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-NEXT: v_mul_f64 v[10:11], v[14:15], v[12:13] +; GFX9-NEXT: v_subrev_u32_e32 v17, 26, v17 +; GFX9-NEXT: v_rndne_f64_e32 v[10:11], v[10:11] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_add_f64 v[18:19], v[10:11], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v18, vcc +; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v17 +; GFX9-NEXT: s_cbranch_vccnz .LBB13_5 +; GFX9-NEXT: ; %bb.6: ; %Flow51 +; GFX9-NEXT: v_mov_b32_e32 v10, v14 +; GFX9-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX9-NEXT: v_subrev_u32_e32 v14, 25, v17 +; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] +; GFX9-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX9-NEXT: v_fma_f64 v[10:11], 
-v[12:13], v[8:9], v[10:11] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc +; GFX9-NEXT: v_ldexp_f64 v[8:9], v[8:9], v16 +; GFX9-NEXT: v_bfi_b32 v9, s2, v9, v1 +; GFX9-NEXT: .LBB13_8: +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB13_10 +; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_11 +; GFX9-NEXT: s_branch .LBB13_16 +; GFX9-NEXT: .LBB13_10: +; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX9-NEXT: .LBB13_11: ; %frem.compute15 +; GFX9-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] +; GFX9-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; GFX9-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; GFX9-NEXT: v_add_u32_e32 v18, -1, v17 +; GFX9-NEXT: v_not_b32_e32 v14, v18 +; GFX9-NEXT: v_add_u32_e32 v19, v14, v16 +; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; GFX9-NEXT: v_div_scale_f64 v[14:15], s[2:3], v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[20:21], v[14:15] +; GFX9-NEXT: v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0 +; GFX9-NEXT: v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21] +; GFX9-NEXT: v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0 +; GFX9-NEXT: v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21] +; GFX9-NEXT: v_div_scale_f64 v[22:23], vcc, 1.0, v[10:11], 1.0 +; GFX9-NEXT: v_mul_f64 v[24:25], v[22:23], v[20:21] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[14:15], v[24:25], v[22:23] ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], 
v[4:5], v[0:1] -; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[20:21], v[24:25] +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 +; GFX9-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; GFX9-NEXT: s_cbranch_vccnz .LBB13_15 +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: v_sub_u32_e32 v16, v16, v17 +; GFX9-NEXT: v_add_u32_e32 v19, 26, v16 +; GFX9-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-NEXT: v_mul_f64 v[12:13], v[16:17], v[14:15] +; GFX9-NEXT: v_subrev_u32_e32 v19, 26, v19 +; GFX9-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: v_add_f64 v[20:21], v[12:13], v[10:11] +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v20, vcc +; GFX9-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v19 +; GFX9-NEXT: s_cbranch_vccnz .LBB13_13 +; GFX9-NEXT: ; %bb.14: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX9-NEXT: v_subrev_u32_e32 v16, 25, v19 +; GFX9-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] +; GFX9-NEXT: v_rndne_f64_e32 v[14:15], v[14:15] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: v_add_f64 v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc +; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], v18 +; GFX9-NEXT: v_bfi_b32 v11, s2, v11, v3 +; GFX9-NEXT: 
.LBB13_16: ; %Flow50 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000 +; GFX9-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_cmp_nge_f64_e64 s[4:5], |v[0:1]|, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], |v[2:3]|, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_and_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc +; GFX9-NEXT: v_cmp_lg_f64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: s_and_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v2f64: @@ -5422,39 +16757,168 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[8:9], s2, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 
-; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] -; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[6:7], s2, v[4:5], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] -; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB13_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| +; GFX10-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB13_3 +; GFX10-NEXT: s_branch .LBB13_8 +; GFX10-NEXT: .LBB13_2: +; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX10-NEXT: .LBB13_3: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] +; GFX10-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; GFX10-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; GFX10-NEXT: v_add_nc_u32_e32 v16, -1, v13 +; GFX10-NEXT: v_readfirstlane_b32 s3, v13 +; GFX10-NEXT: v_readfirstlane_b32 s2, v12 +; GFX10-NEXT: v_not_b32_e32 v13, v16 +; GFX10-NEXT: v_add_nc_u32_e32 
v17, v13, v12 +; GFX10-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 +; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, v[8:9], v[8:9], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_div_scale_f64 v[18:19], vcc_lo, 1.0, v[8:9], 1.0 +; GFX10-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[12:13], v[20:21], v[18:19] +; GFX10-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[14:15], v[20:21] +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 +; GFX10-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB13_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 26 +; GFX10-NEXT: .LBB13_5: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v15, v11 +; GFX10-NEXT: v_mov_b32_e32 v14, v10 +; GFX10-NEXT: s_sub_i32 s2, s2, 26 +; GFX10-NEXT: s_cmp_gt_i32 s2, 26 +; GFX10-NEXT: v_mul_f64 v[10:11], v[14:15], v[12:13] +; GFX10-NEXT: v_rndne_f64_e32 v[10:11], v[10:11] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX10-NEXT: v_add_f64 v[17:18], v[10:11], v[8:9] +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_5 +; GFX10-NEXT: ; %bb.6: ; %Flow51 +; GFX10-NEXT: v_mov_b32_e32 v10, v14 +; GFX10-NEXT: v_mov_b32_e32 v17, s2 +; GFX10-NEXT: v_mov_b32_e32 v11, v15 +; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 +; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], 
v[12:13] +; GFX10-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX10-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[8:9], v[8:9], v16 +; GFX10-NEXT: v_bfi_b32 v9, 0x7fffffff, v9, v1 +; GFX10-NEXT: .LBB13_8: +; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB13_10 +; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| +; GFX10-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB13_11 +; GFX10-NEXT: s_branch .LBB13_16 +; GFX10-NEXT: .LBB13_10: +; GFX10-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX10-NEXT: .LBB13_11: ; %frem.compute15 +; GFX10-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] +; GFX10-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; GFX10-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; GFX10-NEXT: v_add_nc_u32_e32 v18, -1, v15 +; GFX10-NEXT: v_readfirstlane_b32 s3, v15 +; GFX10-NEXT: v_readfirstlane_b32 s2, v14 +; GFX10-NEXT: v_not_b32_e32 v15, v18 +; GFX10-NEXT: v_add_nc_u32_e32 v19, v15, v14 +; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; GFX10-NEXT: v_div_scale_f64 v[14:15], s4, v[10:11], v[10:11], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX10-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX10-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] +; GFX10-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX10-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] +; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, 1.0, v[10:11], 1.0 +; 
GFX10-NEXT: v_mul_f64 v[22:23], v[20:21], v[16:17] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[14:15], v[22:23], v[20:21] +; GFX10-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[16:17], v[22:23] +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 +; GFX10-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; GFX10-NEXT: s_cbranch_vccnz .LBB13_15 +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s2, s2, 26 +; GFX10-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v17, v13 +; GFX10-NEXT: v_mov_b32_e32 v16, v12 +; GFX10-NEXT: s_sub_i32 s2, s2, 26 +; GFX10-NEXT: s_cmp_gt_i32 s2, 26 +; GFX10-NEXT: v_mul_f64 v[12:13], v[16:17], v[14:15] +; GFX10-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13] +; GFX10-NEXT: v_add_f64 v[19:20], v[12:13], v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v20, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v19, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_13 +; GFX10-NEXT: ; %bb.14: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v12, v16 +; GFX10-NEXT: v_mov_b32_e32 v19, s2 +; GFX10-NEXT: v_mov_b32_e32 v13, v17 +; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 +; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 +; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] +; GFX10-NEXT: v_rndne_f64_e32 v[14:15], v[14:15] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13] +; GFX10-NEXT: v_add_f64 v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], v18 +; GFX10-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, v3 +; GFX10-NEXT: .LBB13_16: ; %Flow50 
+; GFX10-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[2:3]| +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc_lo +; GFX10-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[6:7] +; GFX10-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v2f64: @@ -5462,51 +16926,200 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] -; GFX11-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64 +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[4:5] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB13_2 +; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| +; GFX11-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc_lo +; GFX11-NEXT: s_cbranch_execz 
.LBB13_3 +; GFX11-NEXT: s_branch .LBB13_8 +; GFX11-NEXT: .LBB13_2: +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX11-NEXT: .LBB13_3: ; %frem.compute +; GFX11-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; GFX11-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; GFX11-NEXT: v_add_nc_u32_e32 v16, -1, v13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v13 +; GFX11-NEXT: v_readfirstlane_b32 s2, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v13, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v17, v13, v12 +; GFX11-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_scale_f64 v[12:13], null, v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[12:13] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: v_div_scale_f64 v[18:19], vcc_lo, 1.0, v[8:9], 1.0 +; GFX11-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX11-NEXT: v_fma_f64 v[12:13], -v[12:13], v[20:21], v[18:19] +; GFX11-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[14:15], v[20:21] +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_7 +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 26 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB13_5: ; %frem.loop_body +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GFX11-NEXT: s_sub_i32 s2, s2, 26 +; GFX11-NEXT: s_cmp_gt_i32 s2, 26 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] -; GFX11-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] +; GFX11-NEXT: v_mul_f64 v[10:11], v[14:15], v[12:13] +; GFX11-NEXT: v_rndne_f64_e32 v[10:11], v[10:11] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; GFX11-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15] +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_add_f64 v[17:18], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v18 :: 
v_dual_cndmask_b32 v10, v10, v17 +; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_5 +; GFX11-NEXT: ; %bb.6: ; %Flow51 +; GFX11-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 +; GFX11-NEXT: v_mov_b32_e32 v11, v15 +; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 +; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] +; GFX11-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11] +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX11-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v9, v11, v9 :: v_dual_cndmask_b32 v8, v10, v8 +; GFX11-NEXT: v_ldexp_f64 v[8:9], v[8:9], v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, v9, v1 +; GFX11-NEXT: .LBB13_8: +; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB13_10 +; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| +; GFX11-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB13_11 +; GFX11-NEXT: s_branch .LBB13_16 +; GFX11-NEXT: .LBB13_10: +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX11-NEXT: .LBB13_11: ; %frem.compute15 +; GFX11-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; 
GFX11-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; GFX11-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; GFX11-NEXT: v_add_nc_u32_e32 v18, -1, v15 +; GFX11-NEXT: v_readfirstlane_b32 s3, v15 +; GFX11-NEXT: v_readfirstlane_b32 s2, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v15, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v15, v14 +; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_scale_f64 v[14:15], null, v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] +; GFX11-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] +; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, 1.0, v[10:11], 1.0 +; GFX11-NEXT: v_mul_f64 v[22:23], v[20:21], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], -v[14:15], v[22:23], v[20:21] +; GFX11-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[16:17], v[22:23] +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_15 +; GFX11-NEXT: ; %bb.12: ; 
%frem.loop_body23.preheader +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, 26 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 +; GFX11-NEXT: s_sub_i32 s2, s2, 26 +; GFX11-NEXT: s_cmp_gt_i32 s2, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[16:17], v[14:15] +; GFX11-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17] +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13] +; GFX11-NEXT: v_add_f64 v[19:20], v[12:13], v[10:11] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX11-NEXT: v_dual_cndmask_b32 v13, v13, v20 :: v_dual_cndmask_b32 v12, v12, v19 +; GFX11-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_13 +; GFX11-NEXT: ; %bb.14: ; %Flow +; GFX11-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 +; GFX11-NEXT: v_mov_b32_e32 v13, v17 +; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 +; GFX11-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f64 v[6:7], 
v[6:7], v[8:9], v[12:13] -; GFX11-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] +; GFX11-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] +; GFX11-NEXT: v_rndne_f64_e32 v[14:15], v[14:15] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX11-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13] +; GFX11-NEXT: v_add_f64 v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v11, v13, v11 :: v_dual_cndmask_b32 v10, v12, v10 +; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, v3 +; GFX11-NEXT: .LBB13_16: ; %Flow50 +; GFX11-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[2:3]| +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc_lo +; GFX11-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc_lo +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v2f64: @@ -5514,50 +17127,198 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v16, 0 +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 
-; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[2:3] -; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64 +; GFX1150-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX1150-NEXT: global_load_b128 v[4:7], v4, s[4:5] offset:64 ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX1150-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| +; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1150-NEXT: s_cbranch_vccz .LBB13_2 +; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| +; GFX1150-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB13_3 +; GFX1150-NEXT: s_branch .LBB13_8 +; GFX1150-NEXT: .LBB13_2: +; GFX1150-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1150-NEXT: .LBB13_3: ; %frem.compute +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; GFX1150-NEXT: v_add_nc_u32_e32 v16, -1, v13 +; GFX1150-NEXT: v_readfirstlane_b32 s3, v13 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v13, v16 +; GFX1150-NEXT: v_add_nc_u32_e32 v17, v13, v12 +; GFX1150-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX1150-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX1150-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] -; GFX1150-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX1150-NEXT: v_div_scale_f64 v[12:13], null, v[8:9], v[8:9], 1.0 +; GFX1150-NEXT: v_rcp_f64_e32 v[14:15], v[12:13] +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 +; GFX1150-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] -; GFX1150-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX1150-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 +; GFX1150-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX1150-NEXT: v_div_scale_f64 v[18:19], vcc_lo, 1.0, v[8:9], 1.0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] -; GFX1150-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX1150-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] -; GFX1150-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX1150-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX1150-NEXT: v_fma_f64 v[12:13], -v[12:13], v[20:21], v[18:19] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[14:15], v[20:21] +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 
+; GFX1150-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB13_7 +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: s_sub_i32 s2, s2, s3 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s2, s2, 26 +; GFX1150-NEXT: .p2align 6 +; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GFX1150-NEXT: s_sub_i32 s2, s2, 26 +; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[10:11], v[14:15], v[12:13] +; GFX1150-NEXT: v_rndne_f64_e32 v[10:11], v[10:11] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15] +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX1150-NEXT: v_add_f64 v[17:18], v[10:11], v[8:9] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_dual_cndmask_b32 v11, v11, v18 :: v_dual_cndmask_b32 v10, v10, v17 +; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; GFX1150-NEXT: s_cbranch_scc1 .LBB13_5 +; GFX1150-NEXT: ; %bb.6: ; %Flow51 +; GFX1150-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 +; GFX1150-NEXT: v_mov_b32_e32 v11, v15 +; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 +; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] +; GFX1150-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11] +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX1150-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_dual_cndmask_b32 v9, v11, v9 :: v_dual_cndmask_b32 v8, v10, v8 +; GFX1150-NEXT: v_ldexp_f64 v[8:9], v[8:9], v16 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_bfi_b32 v9, 0x7fffffff, v9, v1 +; GFX1150-NEXT: .LBB13_8: +; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| +; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1150-NEXT: s_cbranch_vccz .LBB13_10 +; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| +; GFX1150-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB13_11 +; GFX1150-NEXT: s_branch .LBB13_16 +; GFX1150-NEXT: .LBB13_10: +; GFX1150-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1150-NEXT: .LBB13_11: ; %frem.compute15 +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1150-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; GFX1150-NEXT: v_add_nc_u32_e32 v18, -1, v15 +; GFX1150-NEXT: v_readfirstlane_b32 s3, v15 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v14 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_not_b32_e32 v15, v18 +; GFX1150-NEXT: v_add_nc_u32_e32 v19, v15, v14 +; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; GFX1150-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_scale_f64 v[14:15], null, v[10:11], v[10:11], 1.0 +; GFX1150-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX1150-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX1150-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX1150-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX1150-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX1150-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX1150-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX1150-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] +; GFX1150-NEXT: v_div_scale_f64 v[20:21], vcc_lo, 1.0, v[10:11], 1.0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; GFX1150-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX1150-NEXT: v_mul_f64 v[22:23], v[20:21], v[16:17] +; GFX1150-NEXT: v_fma_f64 v[14:15], -v[14:15], v[22:23], v[20:21] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[16:17], v[22:23] +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 +; GFX1150-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; GFX1150-NEXT: s_cbranch_vccnz .LBB13_15 +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: s_sub_i32 s2, s2, s3 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_add_i32 s2, s2, 26 +; GFX1150-NEXT: .p2align 6 +; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 +; GFX1150-NEXT: s_sub_i32 s2, s2, 26 +; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; GFX1150-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] +; GFX1150-NEXT: v_mul_f64 v[12:13], v[16:17], v[14:15] +; GFX1150-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; GFX1150-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX1150-NEXT: v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17] +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13] +; GFX1150-NEXT: v_add_f64 v[19:20], v[12:13], v[10:11] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_dual_cndmask_b32 v13, v13, v20 :: v_dual_cndmask_b32 v12, v12, v19 +; GFX1150-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; GFX1150-NEXT: s_cbranch_scc1 .LBB13_13 +; GFX1150-NEXT: ; %bb.14: ; %Flow +; GFX1150-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 +; GFX1150-NEXT: v_mov_b32_e32 v13, v17 +; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 +; GFX1150-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] +; GFX1150-NEXT: v_rndne_f64_e32 v[14:15], v[14:15] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 
0, v[12:13] +; GFX1150-NEXT: v_add_f64 v[10:11], v[12:13], v[10:11] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_dual_cndmask_b32 v11, v13, v11 :: v_dual_cndmask_b32 v10, v12, v10 +; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], v18 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, v3 +; GFX1150-NEXT: .LBB13_16: ; %Flow50 +; GFX1150-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[4:5] +; GFX1150-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX1150-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[2:3]| +; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v9, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc_lo +; GFX1150-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[6:7] +; GFX1150-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v11, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc_lo +; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1150-NEXT: s_endpgm ; ; GFX1200-LABEL: frem_v2f64: @@ -5565,51 +17326,208 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_clause 0x1 ; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v16, 0 +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: global_load_b128 v[0:3], v16, s[2:3] -; GFX1200-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64 +; GFX1200-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX1200-NEXT: global_load_b128 v[4:7], v4, s[4:5] offset:64 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX1200-NEXT: 
v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1200-NEXT: s_cbranch_vccz .LBB13_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| +; GFX1200-NEXT: v_and_b32_e32 v8, 0x80000000, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB13_3 +; GFX1200-NEXT: s_branch .LBB13_8 +; GFX1200-NEXT: .LBB13_2: +; GFX1200-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1200-NEXT: .LBB13_3: ; %frem.compute +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[8:9], 26 +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[4:5]| +; GFX1200-NEXT: v_add_nc_u32_e32 v16, -1, v13 +; GFX1200-NEXT: v_readfirstlane_b32 s3, v13 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v12 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v13, v16 +; GFX1200-NEXT: v_add_nc_u32_e32 v17, v13, v12 +; GFX1200-NEXT: v_ldexp_f64 v[8:9], v[8:9], 1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX1200-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX1200-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] -; GFX1200-NEXT: v_mul_f64_e32 v[14:15], v[12:13], v[10:11] +; GFX1200-NEXT: v_div_scale_f64 v[12:13], null, 
v[8:9], v[8:9], 1.0 +; GFX1200-NEXT: v_rcp_f64_e32 v[14:15], v[12:13] +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 +; GFX1200-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] -; GFX1200-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX1200-NEXT: v_fma_f64 v[18:19], -v[12:13], v[14:15], 1.0 +; GFX1200-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX1200-NEXT: v_div_scale_f64 v[18:19], vcc_lo, 1.0, v[8:9], 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] -; GFX1200-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX1200-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] -; GFX1200-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX1200-NEXT: v_mul_f64_e32 v[20:21], v[18:19], v[14:15] +; GFX1200-NEXT: v_fma_f64 v[12:13], -v[12:13], v[20:21], v[18:19] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[14:15], v[20:21] +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 +; GFX1200-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB13_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_add_co_i32 s2, s2, 26 +; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX1200-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 +; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 +; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[14:15], v[12:13] +; GFX1200-NEXT: v_rndne_f64_e32 v[10:11], v[10:11] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15] +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX1200-NEXT: v_add_f64_e32 v[17:18], v[10:11], v[8:9] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_dual_cndmask_b32 v11, v11, v18 :: v_dual_cndmask_b32 v10, v10, v17 +; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 +; GFX1200-NEXT: s_cbranch_scc1 .LBB13_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow51 +; GFX1200-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 +; GFX1200-NEXT: v_mov_b32_e32 v11, v15 +; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 +; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[12:13], v[10:11], v[12:13] +; GFX1200-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11] +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11] +; GFX1200-NEXT: v_add_f64_e32 v[8:9], v[10:11], v[8:9] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_dual_cndmask_b32 v9, v11, v9 :: v_dual_cndmask_b32 v8, v10, v8 +; 
GFX1200-NEXT: v_ldexp_f64 v[8:9], v[8:9], v16 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_bfi_b32 v9, 0x7fffffff, v9, v1 +; GFX1200-NEXT: .LBB13_8: +; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccz .LBB13_10 +; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| +; GFX1200-NEXT: v_and_b32_e32 v10, 0x80000000, v3 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e64 v10, v2, 0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB13_11 +; GFX1200-NEXT: s_branch .LBB13_16 +; GFX1200-NEXT: .LBB13_10: +; GFX1200-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1200-NEXT: .LBB13_11: ; %frem.compute15 +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_ldexp_f64 v[12:13], v[10:11], 26 +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[6:7]| +; GFX1200-NEXT: v_add_nc_u32_e32 v18, -1, v15 +; GFX1200-NEXT: v_readfirstlane_b32 s3, v15 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v14 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v15, v18 +; GFX1200-NEXT: v_add_nc_u32_e32 v19, v15, v14 +; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], 1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_div_scale_f64 v[14:15], null, v[10:11], v[10:11], 1.0 +; GFX1200-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 
v[10:11], -v[6:7], v[8:9], 1.0 -; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX1200-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX1200-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 -; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX1200-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX1200-NEXT: v_fma_f64 v[20:21], -v[14:15], v[16:17], 1.0 +; GFX1200-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[16:17] +; GFX1200-NEXT: v_div_scale_f64 v[20:21], vcc_lo, 1.0, v[10:11], 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f64_e32 v[12:13], v[10:11], v[8:9] -; GFX1200-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX1200-NEXT: v_mul_f64_e32 v[22:23], v[20:21], v[16:17] +; GFX1200-NEXT: v_fma_f64 v[14:15], -v[14:15], v[22:23], v[20:21] ; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[16:17], v[22:23] +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 +; GFX1200-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB13_15 +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s2, s2, 26 +; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1200-NEXT: v_mul_f64_e32 
v[12:13], v[16:17], v[14:15] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; GFX1200-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] +; GFX1200-NEXT: v_rndne_f64_e32 v[12:13], v[12:13] +; GFX1200-NEXT: v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13] +; GFX1200-NEXT: v_add_f64_e32 v[19:20], v[12:13], v[10:11] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_cndmask_b32 v13, v13, v20 :: v_dual_cndmask_b32 v12, v12, v19 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 +; GFX1200-NEXT: s_cbranch_scc1 .LBB13_13 +; GFX1200-NEXT: ; %bb.14: ; %Flow +; GFX1200-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 +; GFX1200-NEXT: v_mov_b32_e32 v13, v17 +; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 +; GFX1200-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f64_e32 v[14:15], v[12:13], v[14:15] +; GFX1200-NEXT: v_rndne_f64_e32 v[14:15], v[14:15] ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; GFX1200-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX1200-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX1200-NEXT: v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13] +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13] +; GFX1200-NEXT: v_add_f64_e32 v[10:11], v[12:13], v[10:11] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: 
v_dual_cndmask_b32 v11, v13, v11 :: v_dual_cndmask_b32 v10, v12, v10 +; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], v18 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, v3 +; GFX1200-NEXT: .LBB13_16: ; %Flow50 +; GFX1200-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[4:5] +; GFX1200-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[0:1]| +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX1200-NEXT: v_cmp_nle_f64_e64 s2, 0x7ff00000, |v[2:3]| +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v9, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc_lo +; GFX1200-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0, v[6:7] +; GFX1200-NEXT: s_and_b32 vcc_lo, s2, vcc_lo +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v11, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc_lo +; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 6df3d255244d2..4288eef5f3a68 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -enable-new-pm -mtriple=amdgcn--amdhsa -O0 -print-pipeline-passes < %s 2>&1 \ ; RUN: | FileCheck -check-prefix=GCN-O0 %s @@ -8,12 +9,16 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,s
i-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,s
i-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-cont
rol-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-cont
rol-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN-O0: {{.*}} +; GCN-O2: {{.*}} +; GCN-O3: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 097154ed23ede..2a76d83cd7dac 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2570,21 +2570,81 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 -; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s1, v0, |s0| +; GFX1032-NEXT: s_and_saveexec_b32 s2, s1 +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX1032-NEXT: ; %bb.1: ; %frem.else +; GFX1032-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0 +; GFX1032-NEXT: v_cmp_eq_f32_e64 vcc_lo, v0, |s0| +; GFX1032-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: ; %bb.2: ; %Flow13 +; GFX1032-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB51_8 +; GFX1032-NEXT: ; %bb.3: ; %frem.compute +; GFX1032-NEXT: v_frexp_mant_f32_e64 v1, |s0| +; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; GFX1032-NEXT: v_frexp_mant_f32_e32 v8, v0 +; GFX1032-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1032-NEXT: v_div_scale_f32 v2, s2, v1, v1, 1.0 +; GFX1032-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0 +; GFX1032-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1032-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX1032-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX1032-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX1032-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v6, s0 +; GFX1032-NEXT: v_fma_f32 v5, -v2, v4, v5 +; GFX1032-NEXT: v_add_nc_u32_e32 v2, -1, v6 +; GFX1032-NEXT: v_div_fmas_f32 v3, v5, v3, v4 +; GFX1032-NEXT: v_xad_u32 v4, v2, -1, v7 +; GFX1032-NEXT: 
v_ldexp_f32 v5, v8, 12 +; GFX1032-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 12, v4 +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB51_7 +; GFX1032-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1032-NEXT: v_sub_nc_u32_e32 v4, v7, v6 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, 12, v4 +; GFX1032-NEXT: .LBB51_5: ; %frem.loop_body +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_mov_b32_e32 v6, v5 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, -12, v4 +; GFX1032-NEXT: v_mul_f32_e32 v5, v6, v3 +; GFX1032-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1032-NEXT: v_fma_f32 v5, -v5, v1, v6 +; GFX1032-NEXT: v_add_f32_e32 v7, v5, v1 +; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1032-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 +; GFX1032-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1032-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execnz .LBB51_5 +; GFX1032-NEXT: ; %bb.6: ; %Flow +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_mov_b32_e32 v5, v6 +; GFX1032-NEXT: .LBB51_7: ; %Flow12 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1032-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX1032-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1032-NEXT: v_fma_f32 v3, -v3, v1, v4 +; GFX1032-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1032-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1032-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v0 +; GFX1032-NEXT: .LBB51_8: ; %Flow14 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: v_cmp_lg_f32_e64 vcc_lo, s0, 0 ; GFX1032-NEXT: s_brev_b32 s1, 1 -; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 -; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 
-; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX1032-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX1032-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX1032-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0 -; GFX1032-NEXT: v_trunc_f32_e32 v1, v1 -; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0 +; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 @@ -2593,29 +2653,89 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 -; GFX1032-NEXT: ; %bb.1: ; %if.then +; GFX1032-NEXT: ; %bb.9: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: ; %bb.10: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fcmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 -; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 -; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 -; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 -; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 -; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 +; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, |s6| +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; 
GFX1064-NEXT: ; %bb.1: ; %frem.else +; GFX1064-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0 +; GFX1064-NEXT: v_cmp_eq_f32_e64 vcc, v0, |s6| +; GFX1064-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: ; %bb.2: ; %Flow13 +; GFX1064-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB51_8 +; GFX1064-NEXT: ; %bb.3: ; %frem.compute +; GFX1064-NEXT: v_frexp_mant_f32_e64 v1, |s6| +; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; GFX1064-NEXT: v_frexp_mant_f32_e32 v8, v0 +; GFX1064-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1064-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, 1.0 +; GFX1064-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 +; GFX1064-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1064-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX1064-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX1064-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX1064-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v6, s6 +; GFX1064-NEXT: v_fma_f32 v5, -v2, v4, v5 +; GFX1064-NEXT: v_add_nc_u32_e32 v2, -1, v6 +; GFX1064-NEXT: v_div_fmas_f32 v3, v5, v3, v4 +; GFX1064-NEXT: v_xad_u32 v4, v2, -1, v7 +; GFX1064-NEXT: v_ldexp_f32 v5, v8, 12 +; GFX1064-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB51_7 +; GFX1064-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1064-NEXT: v_sub_nc_u32_e32 v4, v7, v6 +; GFX1064-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064-NEXT: v_add_nc_u32_e32 v4, 12, v4 +; GFX1064-NEXT: .LBB51_5: ; %frem.loop_body +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_mov_b32_e32 v6, v5 +; GFX1064-NEXT: v_add_nc_u32_e32 v4, -12, v4 +; GFX1064-NEXT: v_mul_f32_e32 v5, v6, v3 +; GFX1064-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1064-NEXT: v_fma_f32 v5, -v5, v1, v6 +; GFX1064-NEXT: v_add_f32_e32 v7, v5, v1 +; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; GFX1064-NEXT: 
v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 13, v4 +; GFX1064-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1064-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execnz .LBB51_5 +; GFX1064-NEXT: ; %bb.6: ; %Flow +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v6 +; GFX1064-NEXT: .LBB51_7: ; %Flow12 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1064-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX1064-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1064-NEXT: v_fma_f32 v3, -v3, v1, v4 +; GFX1064-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1064-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1064-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v0 +; GFX1064-NEXT: .LBB51_8: ; %Flow14 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_cmp_lg_f32_e64 vcc, s6, 0 +; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc ; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 ; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 @@ -2625,9 +2745,9 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX1064-NEXT: ; %bb.1: ; %if.then +; GFX1064-NEXT: ; %bb.9: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: ; %bb.10: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2756,20 +2876,80 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 -; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 -; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1032-NEXT: v_div_scale_f32 v3, vcc_lo, v0, s0, v0 -; GFX1032-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX1032-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX1032-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX1032-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0 -; GFX1032-NEXT: v_trunc_f32_e32 v1, v1 -; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s1, v0, |s0| +; GFX1032-NEXT: s_and_saveexec_b32 s2, s1 +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX1032-NEXT: ; %bb.1: ; %frem.else +; GFX1032-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0 +; GFX1032-NEXT: v_cmp_eq_f32_e64 vcc_lo, v0, |s0| +; GFX1032-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: ; %bb.2: ; %Flow13 +; GFX1032-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB53_8 +; GFX1032-NEXT: ; %bb.3: ; %frem.compute +; GFX1032-NEXT: v_frexp_mant_f32_e64 v1, |s0| +; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; GFX1032-NEXT: v_frexp_mant_f32_e32 v8, v0 +; GFX1032-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1032-NEXT: v_div_scale_f32 v2, s2, v1, v1, 1.0 +; GFX1032-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0 +; GFX1032-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1032-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX1032-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX1032-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX1032-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v6, s0 +; GFX1032-NEXT: v_fma_f32 v5, -v2, v4, v5 +; GFX1032-NEXT: v_add_nc_u32_e32 v2, -1, v6 +; GFX1032-NEXT: v_div_fmas_f32 v3, v5, v3, v4 +; GFX1032-NEXT: v_xad_u32 v4, v2, -1, v7 +; GFX1032-NEXT: v_ldexp_f32 v5, v8, 12 +; GFX1032-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1032-NEXT: 
v_cmp_lt_i32_e32 vcc_lo, 12, v4 +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB53_7 +; GFX1032-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1032-NEXT: v_sub_nc_u32_e32 v4, v7, v6 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, 12, v4 +; GFX1032-NEXT: .LBB53_5: ; %frem.loop_body +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_mov_b32_e32 v6, v5 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, -12, v4 +; GFX1032-NEXT: v_mul_f32_e32 v5, v6, v3 +; GFX1032-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1032-NEXT: v_fma_f32 v5, -v5, v1, v6 +; GFX1032-NEXT: v_add_f32_e32 v7, v5, v1 +; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 +; GFX1032-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 +; GFX1032-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1032-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execnz .LBB53_5 +; GFX1032-NEXT: ; %bb.6: ; %Flow +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_mov_b32_e32 v5, v6 +; GFX1032-NEXT: .LBB53_7: ; %Flow12 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1032-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX1032-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1032-NEXT: v_fma_f32 v3, -v3, v1, v4 +; GFX1032-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1032-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1032-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v0 +; GFX1032-NEXT: .LBB53_8: ; %Flow14 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: v_cmp_lg_f32_e64 vcc_lo, s0, 0 +; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 @@ -2779,29 +2959,89 @@ define amdgpu_kernel void 
@fcmp32(float %n, float %s) { ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 -; GFX1032-NEXT: ; %bb.1: ; %if.then +; GFX1032-NEXT: ; %bb.9: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: ; %bb.10: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fcmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 -; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 -; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 -; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 -; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 -; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 +; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, |s6| +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; GFX1064-NEXT: ; %bb.1: ; %frem.else +; GFX1064-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0 +; GFX1064-NEXT: v_cmp_eq_f32_e64 vcc, v0, |s6| +; GFX1064-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: ; %bb.2: ; %Flow13 +; GFX1064-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB53_8 +; GFX1064-NEXT: ; %bb.3: ; %frem.compute +; GFX1064-NEXT: v_frexp_mant_f32_e64 v1, |s6| +; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 +; GFX1064-NEXT: v_frexp_mant_f32_e32 v8, v0 +; GFX1064-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1064-NEXT: 
v_div_scale_f32 v2, s[2:3], v1, v1, 1.0 +; GFX1064-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 +; GFX1064-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1064-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX1064-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX1064-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX1064-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v6, s6 +; GFX1064-NEXT: v_fma_f32 v5, -v2, v4, v5 +; GFX1064-NEXT: v_add_nc_u32_e32 v2, -1, v6 +; GFX1064-NEXT: v_div_fmas_f32 v3, v5, v3, v4 +; GFX1064-NEXT: v_xad_u32 v4, v2, -1, v7 +; GFX1064-NEXT: v_ldexp_f32 v5, v8, 12 +; GFX1064-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB53_7 +; GFX1064-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1064-NEXT: v_sub_nc_u32_e32 v4, v7, v6 +; GFX1064-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064-NEXT: v_add_nc_u32_e32 v4, 12, v4 +; GFX1064-NEXT: .LBB53_5: ; %frem.loop_body +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_mov_b32_e32 v6, v5 +; GFX1064-NEXT: v_add_nc_u32_e32 v4, -12, v4 +; GFX1064-NEXT: v_mul_f32_e32 v5, v6, v3 +; GFX1064-NEXT: v_rndne_f32_e32 v5, v5 +; GFX1064-NEXT: v_fma_f32 v5, -v5, v1, v6 +; GFX1064-NEXT: v_add_f32_e32 v7, v5, v1 +; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; GFX1064-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 13, v4 +; GFX1064-NEXT: v_ldexp_f32 v5, v5, 12 +; GFX1064-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execnz .LBB53_5 +; GFX1064-NEXT: ; %bb.6: ; %Flow +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v6 +; GFX1064-NEXT: .LBB53_7: ; %Flow12 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_add_nc_u32_e32 v4, -11, v4 +; GFX1064-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX1064-NEXT: 
v_rndne_f32_e32 v3, v3 +; GFX1064-NEXT: v_fma_f32 v3, -v3, v1, v4 +; GFX1064-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX1064-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1064-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v0 +; GFX1064-NEXT: .LBB53_8: ; %Flow14 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_cmp_lg_f32_e64 vcc, s6, 0 +; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc ; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 @@ -2811,9 +3051,9 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX1064-NEXT: ; %bb.1: ; %if.then +; GFX1064-NEXT: ; %bb.9: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: ; %bb.10: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index 90d994909264a..661f67d4989c4 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -20,9 +20,9 @@ ; LAXX-NEXT: Target Pass Configuration ; LAXX-NEXT: Machine Module Information ; LAXX-NEXT: Target Transform Information +; LAXX-NEXT: Assumption Cache Tracker ; LAXX-NEXT: Type-Based Alias Analysis ; LAXX-NEXT: Scoped NoAlias Alias Analysis -; LAXX-NEXT: Assumption Cache Tracker ; LAXX-NEXT: Profile summary info ; LAXX-NEXT: Create Garbage Collector Module Metadata ; LAXX-NEXT: Machine Branch Probability Analysis diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 8d155bd57df13..1e3204dfc999f 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ 
b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -16,9 +16,9 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis -; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll b/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll new file mode 100644 index 0000000000000..18a9339217fe1 --- /dev/null +++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll @@ -0,0 +1,52 @@ +; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck --check-prefixes CHECK %s +; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck --check-prefixes CHECK,OPT1 %s + +; Check the handling of potentially infinite numerators in the frem +; expansion at different optimization levels and with different +; fast-math flags. 
+ +; CHECK-LABEL: define float @frem_x_maybe_inf(float %x, float %y) +; CHECK: 2: +; CHECK: [[FABS:%.*]] = call float @llvm.fabs.f32(float %x) +; CHECK: [[FCMP:%.*]] = fcmp ult float [[FABS]], 0x7FF0000000000000 +; CHECK-NEXT: %ret = select i1 [[FCMP]], float %{{.*}}, float 0x7FF8000000000000 +; CHECK-NEXT: ret float %ret +; CHECK-LABEL: } +define float @frem_x_maybe_inf(float %x, float %y) { + %ret = frem float %x, %y + ret float %ret +} + +; OPT1-LABEL: define float @frem_x_assumed_non_inf(float %x, float %y) +; OPT1: 2: +; OPT1-NOT: call float @llvm.fabs.f32(float %x) +; OPT1-NOT: fcmp ult float [[FABS]], 0x7FF0000000000000 +; OPT1: %ret = select i1 true, float %{{.*}}, float 0x7FF8000000000000 +; OPT1-NEXT: ret float %ret +; OPT1-LABEL: } +; OPT0-LABEL: define float @frem_x_assumed_non_inf(float %x, float %y) +; OPT0: 2: +; OPT0: [[FABS:%.*]] = call float @llvm.fabs.f32(float %x) +; OPT0: [[FCMP:%.*]] = fcmp ult float [[FABS]], 0x7FF0000000000000 +; OPT0-NEXT: %ret = select i1 [[FCMP]], float %{{.*}}, float 0x7FF8000000000000 +; OPT0-NEXT: ret float %ret +; OPT0-LABEL: } +define float @frem_x_assumed_non_inf(float %x, float %y) { + %absx = call float @llvm.fabs.f32(float %x) + %noninf = fcmp ult float %absx, 0x7FF0000000000000 + call void @llvm.assume(i1 %noninf) + %ret = frem float %x, %y + ret float %ret +} + +; CHECK-LABEL: define float @frem_ninf(float %x, float %y) +; CHECK: 2: +; CHECK-NOT: call float @llvm.fabs.f32(float %x) +; CHECK-NOT: fcmp ult float [[FABS]], 0x7FF0000000000000 +; CHECK: %ret = select ninf i1 true, float %{{.*}}, float 0x7FF8000000000000 +; CHECK-NEXT: ret float %ret +; CHECK-LABEL: } +define float @frem_ninf(float %x, float %y) { + %ret = frem ninf float %x, %y + ret float %ret +} diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll new file mode 100644 index 0000000000000..accbbbcadd1ed --- /dev/null +++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll @@ -0,0 +1,1372 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck %s + +define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_f16( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr half, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load half, ptr addrspace(1) [[IN1]], align 4 +; CHECK-NEXT: [[R1:%.*]] = load half, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[AX:%.*]] = call half @llvm.fabs.f16(half [[R0]]) +; CHECK-NEXT: [[AY:%.*]] = call half @llvm.fabs.f16(half [[R1]]) +; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float +; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt float [[AX1]], [[AY2]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB2:.*]]: +; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP25:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP16:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ueq half [[R1]], 0xH0000 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], half 0xH7E00, half [[RET]] +; CHECK-NEXT: [[TMP5:%.*]] = call half @llvm.fabs.f16(half [[R0]]) +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ult half [[TMP5]], 0xH7C00 +; CHECK-NEXT: [[R2:%.*]] = select i1 [[TMP6]], half [[TMP4]], half 0xH7E00 +; CHECK-NEXT: store half [[R2]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP9]], 1 +; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 11) +; CHECK-NEXT: 
[[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-NEXT: [[AY4:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP11]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: [[AYINV:%.*]] = fdiv float 1.000000e+00, [[AY4]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[NB]], 11 +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP14:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[R0]]) +; CHECK-NEXT: [[TMP15:%.*]] = fcmp oeq float [[AX1]], [[AY2]] +; CHECK-NEXT: [[TMP16]] = select i1 [[TMP15]], half [[TMP14]], half [[R0]] +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = fmul float [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call float @llvm.rint.f32(float [[TMP17]]) +; CHECK-NEXT: [[TMP18:%.*]] = fneg float [[Q]] +; CHECK-NEXT: [[AX5:%.*]] = call float @llvm.fma.f32(float [[TMP18]], float [[AY4]], float [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt float [[AX5]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd float [[AX5]], [[AY4]] +; CHECK-NEXT: [[AX6:%.*]] = select i1 [[CLT]], float [[AXP]], float [[AX5]] +; CHECK-NEXT: [[AX_UPDATE]] = call float @llvm.ldexp.f32.i32(float [[AX6]], i32 11) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 11 +; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[NB_IV]], 11 +; CHECK-NEXT: br i1 [[TMP19]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = 
phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[NB_EXIT_PHI]], 11 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 1 +; CHECK-NEXT: [[AX7:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI]], i32 [[TMP21]]) +; CHECK-NEXT: [[TMP22:%.*]] = fmul float [[AX7]], [[AYINV]] +; CHECK-NEXT: [[Q8:%.*]] = call float @llvm.rint.f32(float [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = fneg float [[Q8]] +; CHECK-NEXT: [[AX9:%.*]] = call float @llvm.fma.f32(float [[TMP23]], float [[AY4]], float [[AX7]]) +; CHECK-NEXT: [[CLT10:%.*]] = fcmp olt float [[AX9]], 0.000000e+00 +; CHECK-NEXT: [[AXP11:%.*]] = fadd float [[AX9]], [[AY4]] +; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]] +; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]]) +; CHECK-NEXT: [[TMP24:%.*]] = fptrunc float [[AX13]] to half +; CHECK-NEXT: [[TMP25]] = call half @llvm.copysign.f16(half [[TMP24]], half [[R0]]) +; CHECK-NEXT: br label %[[BB2]] +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 + %r0 = load half, ptr addrspace(1) %in1, align 4 + %r1 = load half, ptr addrspace(1) %gep2, align 4 + %r2 = frem half %r0, %r1 + store half %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @fast_frem_f16( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr half, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load half, ptr addrspace(1) [[IN1]], align 4 +; CHECK-NEXT: [[R1:%.*]] = load half, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fdiv half [[R0]], [[R1]] +; CHECK-NEXT: 
[[TMP2:%.*]] = call half @llvm.trunc.f16(half [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = fneg half [[TMP2]] +; CHECK-NEXT: [[R2:%.*]] = call half @llvm.fma.f16(half [[TMP3]], half [[R1]], half [[R0]]) +; CHECK-NEXT: store half [[R2]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 + %r0 = load half, ptr addrspace(1) %in1, align 4 + %r1 = load half, ptr addrspace(1) %gep2, align 4 + %r2 = frem fast half %r0, %r1 + store half %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @unsafe_frem_f16( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr half, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load half, ptr addrspace(1) [[IN1]], align 4 +; CHECK-NEXT: [[R1:%.*]] = load half, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fdiv half [[R0]], [[R1]] +; CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.trunc.f16(half [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = fneg half [[TMP2]] +; CHECK-NEXT: [[R2:%.*]] = call half @llvm.fma.f16(half [[TMP3]], half [[R1]], half [[R0]]) +; CHECK-NEXT: store half [[R2]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 + %r0 = load half, ptr addrspace(1) %in1, align 4 + %r1 = load half, ptr addrspace(1) %gep2, align 4 + %r2 = frem afn half %r0, %r1 + store half %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_f32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr 
float, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load float, ptr addrspace(1) [[IN1]], align 4 +; CHECK-NEXT: [[R1:%.*]] = load float, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[R0]]) +; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[R1]]) +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt float [[AX]], [[AY]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB2:.*]]: +; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP24:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP16:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ueq float [[R1]], 0.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], float 0x7FF8000000000000, float [[RET]] +; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fabs.f32(float [[R0]]) +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ult float [[TMP5]], 0x7FF0000000000000 +; CHECK-NEXT: [[R2:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0x7FF8000000000000 +; CHECK-NEXT: store float [[R2]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP9]], 1 +; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 12) +; CHECK-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-NEXT: [[AY2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP11]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: [[AYINV:%.*]] = fdiv float 1.000000e+00, [[AY2]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[NB]], 
12 +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[R0]]) +; CHECK-NEXT: [[TMP15:%.*]] = fcmp oeq float [[AX]], [[AY]] +; CHECK-NEXT: [[TMP16]] = select i1 [[TMP15]], float [[TMP14]], float [[R0]] +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = fmul float [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call float @llvm.rint.f32(float [[TMP17]]) +; CHECK-NEXT: [[TMP18:%.*]] = fneg float [[Q]] +; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.fma.f32(float [[TMP18]], float [[AY2]], float [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt float [[AX3]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd float [[AX3]], [[AY2]] +; CHECK-NEXT: [[AX4:%.*]] = select i1 [[CLT]], float [[AXP]], float [[AX3]] +; CHECK-NEXT: [[AX_UPDATE]] = call float @llvm.ldexp.f32.i32(float [[AX4]], i32 12) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 12 +; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[NB_IV]], 12 +; CHECK-NEXT: br i1 [[TMP19]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[NB_EXIT_PHI]], 12 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 1 +; CHECK-NEXT: [[AX5:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI]], i32 [[TMP21]]) +; CHECK-NEXT: [[TMP22:%.*]] = fmul float [[AX5]], [[AYINV]] +; CHECK-NEXT: [[Q6:%.*]] = call float 
@llvm.rint.f32(float [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = fneg float [[Q6]] +; CHECK-NEXT: [[AX7:%.*]] = call float @llvm.fma.f32(float [[TMP23]], float [[AY2]], float [[AX5]]) +; CHECK-NEXT: [[CLT8:%.*]] = fcmp olt float [[AX7]], 0.000000e+00 +; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]] +; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]] +; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]]) +; CHECK-NEXT: [[TMP24]] = call float @llvm.copysign.f32(float [[AX11]], float [[R0]]) +; CHECK-NEXT: br label %[[BB2]] +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 + %r0 = load float, ptr addrspace(1) %in1, align 4 + %r1 = load float, ptr addrspace(1) %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @fast_frem_f32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load float, ptr addrspace(1) [[IN1]], align 4 +; CHECK-NEXT: [[R1:%.*]] = load float, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fdiv float [[R0]], [[R1]] +; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.trunc.f32(float [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[TMP2]] +; CHECK-NEXT: [[R2:%.*]] = call float @llvm.fma.f32(float [[TMP3]], float [[R1]], float [[R0]]) +; CHECK-NEXT: store float [[R2]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 + %r0 = load float, ptr addrspace(1) %in1, align 4 + %r1 = load float, ptr addrspace(1) %gep2, align 4 + %r2 = frem fast float %r0, %r1 + store float %r2, ptr addrspace(1) 
%out, align 4 + ret void +} + +define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @unsafe_frem_f32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load float, ptr addrspace(1) [[IN1]], align 4 +; CHECK-NEXT: [[R1:%.*]] = load float, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fdiv float [[R0]], [[R1]] +; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.trunc.f32(float [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[TMP2]] +; CHECK-NEXT: [[R2:%.*]] = call float @llvm.fma.f32(float [[TMP3]], float [[R1]], float [[R0]]) +; CHECK-NEXT: store float [[R2]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 + %r0 = load float, ptr addrspace(1) %in1, align 4 + %r1 = load float, ptr addrspace(1) %gep2, align 4 + %r2 = frem afn float %r0, %r1 + store float %r2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_f64( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[R0:%.*]] = load double, ptr addrspace(1) [[IN1]], align 8 +; CHECK-NEXT: [[R1:%.*]] = load double, ptr addrspace(1) [[IN2]], align 8 +; CHECK-NEXT: [[AX:%.*]] = call double @llvm.fabs.f64(double [[R0]]) +; CHECK-NEXT: [[AY:%.*]] = call double @llvm.fabs.f64(double [[R1]]) +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt double [[AX]], [[AY]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB2:.*]]: +; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP24:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP16:%.*]], %[[FREM_ELSE]] ] +; 
CHECK-NEXT: [[TMP3:%.*]] = fcmp ueq double [[R1]], 0.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], double 0x7FF8000000000000, double [[RET]] +; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.fabs.f64(double [[R0]]) +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ult double [[TMP5]], 0x7FF0000000000000 +; CHECK-NEXT: [[R2:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0x7FF8000000000000 +; CHECK-NEXT: store double [[R2]], ptr addrspace(1) [[OUT]], align 8 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP7:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { double, i32 } [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { double, i32 } [[TMP7]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP9]], 1 +; CHECK-NEXT: [[AX1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP8]], i32 26) +; CHECK-NEXT: [[TMP10:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { double, i32 } [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { double, i32 } [[TMP10]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-NEXT: [[AY2:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP11]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: [[AYINV:%.*]] = fdiv double 1.000000e+00, [[AY2]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[NB]], 26 +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP14:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[R0]]) +; CHECK-NEXT: [[TMP15:%.*]] = fcmp oeq double [[AX]], [[AY]] +; CHECK-NEXT: [[TMP16]] = select i1 [[TMP15]], double [[TMP14]], double [[R0]] +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi double [ 
[[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = fmul double [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call double @llvm.rint.f64(double [[TMP17]]) +; CHECK-NEXT: [[TMP18:%.*]] = fneg double [[Q]] +; CHECK-NEXT: [[AX3:%.*]] = call double @llvm.fma.f64(double [[TMP18]], double [[AY2]], double [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt double [[AX3]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd double [[AX3]], [[AY2]] +; CHECK-NEXT: [[AX4:%.*]] = select i1 [[CLT]], double [[AXP]], double [[AX3]] +; CHECK-NEXT: [[AX_UPDATE]] = call double @llvm.ldexp.f64.i32(double [[AX4]], i32 26) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 26 +; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[NB_IV]], 26 +; CHECK-NEXT: br i1 [[TMP19]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = phi double [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[NB_EXIT_PHI]], 26 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 1 +; CHECK-NEXT: [[AX5:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX_EXIT_PHI]], i32 [[TMP21]]) +; CHECK-NEXT: [[TMP22:%.*]] = fmul double [[AX5]], [[AYINV]] +; CHECK-NEXT: [[Q6:%.*]] = call double @llvm.rint.f64(double [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = fneg double [[Q6]] +; CHECK-NEXT: [[AX7:%.*]] = call double @llvm.fma.f64(double [[TMP23]], double [[AY2]], double [[AX5]]) +; CHECK-NEXT: [[CLT8:%.*]] = fcmp olt double [[AX7]], 0.000000e+00 +; CHECK-NEXT: [[AXP9:%.*]] = fadd double [[AX7]], [[AY2]] +; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], double [[AXP9]], double [[AX7]] +; CHECK-NEXT: [[AX11:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX10]], i32 [[EY]]) +; CHECK-NEXT: [[TMP24]] = call double @llvm.copysign.f64(double 
[[AX11]], double [[R0]]) +; CHECK-NEXT: br label %[[BB2]] +; + ptr addrspace(1) %in2) { + %r0 = load double, ptr addrspace(1) %in1, align 8 + %r1 = load double, ptr addrspace(1) %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @fast_frem_f64( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[R0:%.*]] = load double, ptr addrspace(1) [[IN1]], align 8 +; CHECK-NEXT: [[R1:%.*]] = load double, ptr addrspace(1) [[IN2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fdiv double [[R0]], [[R1]] +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.trunc.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = fneg double [[TMP2]] +; CHECK-NEXT: [[R2:%.*]] = call double @llvm.fma.f64(double [[TMP3]], double [[R1]], double [[R0]]) +; CHECK-NEXT: store double [[R2]], ptr addrspace(1) [[OUT]], align 8 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %in2) { + %r0 = load double, ptr addrspace(1) %in1, align 8 + %r1 = load double, ptr addrspace(1) %in2, align 8 + %r2 = frem fast double %r0, %r1 + store double %r2, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @unsafe_frem_f64( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[R0:%.*]] = load double, ptr addrspace(1) [[IN1]], align 8 +; CHECK-NEXT: [[R1:%.*]] = load double, ptr addrspace(1) [[IN2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fdiv double [[R0]], [[R1]] +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.trunc.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = fneg double [[TMP2]] +; CHECK-NEXT: [[R2:%.*]] = call double @llvm.fma.f64(double [[TMP3]], double [[R1]], double [[R0]]) +; 
CHECK-NEXT: store double [[R2]], ptr addrspace(1) [[OUT]], align 8 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %in2) { + %r0 = load double, ptr addrspace(1) %in1, align 8 + %r1 = load double, ptr addrspace(1) %in2, align 8 + %r2 = frem afn double %r0, %r1 + store double %r2, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_v2f16( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <2 x half>, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load <2 x half>, ptr addrspace(1) [[IN1]], align 8 +; CHECK-NEXT: [[R1:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[R0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[R1]], i64 0 +; CHECK-NEXT: [[AX:%.*]] = call half @llvm.fabs.f16(half [[TMP1]]) +; CHECK-NEXT: [[AY:%.*]] = call half @llvm.fabs.f16(half [[TMP2]]) +; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float +; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB4:.*]]: +; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP38:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half [[TMP2]], 0xH0000 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]] +; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]]) +; CHECK-NEXT: [[TMP8:%.*]] = fcmp ult half [[TMP7]], 0xH7C00 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], half [[TMP6]], half 0xH7E00 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x half> poison, half [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x half> [[R0]], i64 1 +; 
CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x half> [[R1]], i64 1 +; CHECK-NEXT: [[AX14:%.*]] = call half @llvm.fabs.f16(half [[TMP11]]) +; CHECK-NEXT: [[AY15:%.*]] = call half @llvm.fabs.f16(half [[TMP12]]) +; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float +; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float +; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]] +; CHECK: [[BB14:.*]]: +; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP48:%.*]], %[[FREM_ELSE20]] ] +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000 +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]] +; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = fcmp ult half [[TMP17]], 0xH7C00 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], half [[TMP16]], half 0xH7E00 +; CHECK-NEXT: [[R2:%.*]] = insertelement <2 x half> [[TMP10]], half [[TMP19]], i64 1 +; CHECK-NEXT: store <2 x half> [[R2]], ptr addrspace(1) [[OUT]], align 8 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]]) +; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1 +; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 11) +; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]]) +; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1 +; CHECK-NEXT: [[AY4:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP24]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: 
[[AYINV:%.*]] = fdiv float 1.000000e+00, [[AY4]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 11 +; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]]) +; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX1]], [[AY2]] +; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], half [[TMP27]], half [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = fmul float [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call float @llvm.rint.f32(float [[TMP30]]) +; CHECK-NEXT: [[TMP31:%.*]] = fneg float [[Q]] +; CHECK-NEXT: [[AX5:%.*]] = call float @llvm.fma.f32(float [[TMP31]], float [[AY4]], float [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt float [[AX5]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd float [[AX5]], [[AY4]] +; CHECK-NEXT: [[AX6:%.*]] = select i1 [[CLT]], float [[AXP]], float [[AX5]] +; CHECK-NEXT: [[AX_UPDATE]] = call float @llvm.ldexp.f32.i32(float [[AX6]], i32 11) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 11 +; CHECK-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[NB_IV]], 11 +; CHECK-NEXT: br i1 [[TMP32]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP33:%.*]] = sub i32 [[NB_EXIT_PHI]], 11 +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], 1 +; CHECK-NEXT: [[AX7:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI]], i32 [[TMP34]]) +; 
CHECK-NEXT: [[TMP35:%.*]] = fmul float [[AX7]], [[AYINV]] +; CHECK-NEXT: [[Q8:%.*]] = call float @llvm.rint.f32(float [[TMP35]]) +; CHECK-NEXT: [[TMP36:%.*]] = fneg float [[Q8]] +; CHECK-NEXT: [[AX9:%.*]] = call float @llvm.fma.f32(float [[TMP36]], float [[AY4]], float [[AX7]]) +; CHECK-NEXT: [[CLT10:%.*]] = fcmp olt float [[AX9]], 0.000000e+00 +; CHECK-NEXT: [[AXP11:%.*]] = fadd float [[AX9]], [[AY4]] +; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]] +; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]]) +; CHECK-NEXT: [[TMP37:%.*]] = fptrunc float [[AX13]] to half +; CHECK-NEXT: [[TMP38]] = call half @llvm.copysign.f16(half [[TMP37]], half [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_COMPUTE19]]: +; CHECK-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]]) +; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1 +; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP41]], 1 +; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP40]], i32 11) +; CHECK-NEXT: [[TMP42:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]]) +; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP42]], 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP42]], 1 +; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP44]], 1 +; CHECK-NEXT: [[AY24:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP43]], i32 1) +; CHECK-NEXT: [[NB25:%.*]] = sub i32 [[EX21]], [[EY23]] +; CHECK-NEXT: [[AYINV26:%.*]] = fdiv float 1.000000e+00, [[AY24]] +; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 [[NB25]], 11 +; CHECK-NEXT: br i1 [[TMP45]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]] +; CHECK: [[FREM_ELSE20]]: +; CHECK-NEXT: [[TMP46:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]]) +; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq float [[AX16]], [[AY17]] +; CHECK-NEXT: 
[[TMP48]] = select i1 [[TMP47]], half [[TMP46]], half [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] +; CHECK: [[FREM_LOOP_BODY27]]: +; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ] +; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ] +; CHECK-NEXT: [[TMP49:%.*]] = fmul float [[AX_LOOP_PHI30]], [[AYINV26]] +; CHECK-NEXT: [[Q31:%.*]] = call float @llvm.rint.f32(float [[TMP49]]) +; CHECK-NEXT: [[TMP50:%.*]] = fneg float [[Q31]] +; CHECK-NEXT: [[AX32:%.*]] = call float @llvm.fma.f32(float [[TMP50]], float [[AY24]], float [[AX_LOOP_PHI30]]) +; CHECK-NEXT: [[CLT33:%.*]] = fcmp olt float [[AX32]], 0.000000e+00 +; CHECK-NEXT: [[AXP34:%.*]] = fadd float [[AX32]], [[AY24]] +; CHECK-NEXT: [[AX35:%.*]] = select i1 [[CLT33]], float [[AXP34]], float [[AX32]] +; CHECK-NEXT: [[AX_UPDATE36]] = call float @llvm.ldexp.f32.i32(float [[AX35]], i32 11) +; CHECK-NEXT: [[NB_UPDATE37]] = sub i32 [[NB_IV29]], 11 +; CHECK-NEXT: [[TMP51:%.*]] = icmp sgt i32 [[NB_IV29]], 11 +; CHECK-NEXT: br i1 [[TMP51]], label %[[FREM_LOOP_BODY27]], label %[[FREM_LOOP_EXIT28]] +; CHECK: [[FREM_LOOP_EXIT28]]: +; CHECK-NEXT: [[AX_EXIT_PHI38:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_LOOP_PHI30]], %[[FREM_LOOP_BODY27]] ] +; CHECK-NEXT: [[NB_EXIT_PHI39:%.*]] = phi i32 [ [[NB_IV29]], %[[FREM_LOOP_BODY27]] ], [ [[NB25]], %[[FREM_COMPUTE19]] ] +; CHECK-NEXT: [[TMP52:%.*]] = sub i32 [[NB_EXIT_PHI39]], 11 +; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[TMP52]], 1 +; CHECK-NEXT: [[AX40:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI38]], i32 [[TMP53]]) +; CHECK-NEXT: [[TMP54:%.*]] = fmul float [[AX40]], [[AYINV26]] +; CHECK-NEXT: [[Q41:%.*]] = call float @llvm.rint.f32(float [[TMP54]]) +; CHECK-NEXT: [[TMP55:%.*]] = fneg float [[Q41]] +; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.fma.f32(float [[TMP55]], float [[AY24]], float [[AX40]]) +; CHECK-NEXT: 
[[CLT43:%.*]] = fcmp olt float [[AX42]], 0.000000e+00 +; CHECK-NEXT: [[AXP44:%.*]] = fadd float [[AX42]], [[AY24]] +; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]] +; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]]) +; CHECK-NEXT: [[TMP56:%.*]] = fptrunc float [[AX46]] to half +; CHECK-NEXT: [[TMP57]] = call half @llvm.copysign.f16(half [[TMP56]], half [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 + %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8 + %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8 + %r2 = frem <2 x half> %r0, %r1 + store <2 x half> %r2, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_v4f16( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <4 x half>, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load <4 x half>, ptr addrspace(1) [[IN1]], align 16 +; CHECK-NEXT: [[R1:%.*]] = load <4 x half>, ptr addrspace(1) [[GEP2]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x half> [[R0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x half> [[R1]], i64 0 +; CHECK-NEXT: [[AX:%.*]] = call half @llvm.fabs.f16(half [[TMP1]]) +; CHECK-NEXT: [[AY:%.*]] = call half @llvm.fabs.f16(half [[TMP2]]) +; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float +; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB4:.*]]: +; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP58:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half 
[[TMP2]], 0xH0000 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]] +; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]]) +; CHECK-NEXT: [[TMP8:%.*]] = fcmp ult half [[TMP7]], 0xH7C00 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], half [[TMP6]], half 0xH7E00 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x half> poison, half [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x half> [[R0]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x half> [[R1]], i64 1 +; CHECK-NEXT: [[AX14:%.*]] = call half @llvm.fabs.f16(half [[TMP11]]) +; CHECK-NEXT: [[AY15:%.*]] = call half @llvm.fabs.f16(half [[TMP12]]) +; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float +; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float +; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]] +; CHECK: [[BB14:.*]]: +; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP77:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP68:%.*]], %[[FREM_ELSE20]] ] +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000 +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]] +; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = fcmp ult half [[TMP17]], 0xH7C00 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], half [[TMP16]], half 0xH7E00 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x half> [[TMP10]], half [[TMP19]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x half> [[R0]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x half> [[R1]], i64 2 +; CHECK-NEXT: [[AX47:%.*]] = call half @llvm.fabs.f16(half [[TMP21]]) +; CHECK-NEXT: [[AY48:%.*]] = call half @llvm.fabs.f16(half [[TMP22]]) +; CHECK-NEXT: [[AX49:%.*]] = fpext half [[AX47]] to float +; CHECK-NEXT: [[AY50:%.*]] = fpext half [[AY48]] to float +; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX49]], 
[[AY50]] +; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE52:.*]], label %[[FREM_ELSE53:.*]] +; CHECK: [[BB24:.*]]: +; CHECK-NEXT: [[RET51:%.*]] = phi half [ [[TMP96:%.*]], %[[FREM_LOOP_EXIT61:.*]] ], [ [[TMP87:%.*]], %[[FREM_ELSE53]] ] +; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq half [[TMP22]], 0xH0000 +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], half 0xH7E00, half [[RET51]] +; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.fabs.f16(half [[TMP21]]) +; CHECK-NEXT: [[TMP28:%.*]] = fcmp ult half [[TMP27]], 0xH7C00 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], half [[TMP26]], half 0xH7E00 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP29]], i64 2 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x half> [[R0]], i64 3 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x half> [[R1]], i64 3 +; CHECK-NEXT: [[AX80:%.*]] = call half @llvm.fabs.f16(half [[TMP31]]) +; CHECK-NEXT: [[AY81:%.*]] = call half @llvm.fabs.f16(half [[TMP32]]) +; CHECK-NEXT: [[AX82:%.*]] = fpext half [[AX80]] to float +; CHECK-NEXT: [[AY83:%.*]] = fpext half [[AY81]] to float +; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX82]], [[AY83]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE85:.*]], label %[[FREM_ELSE86:.*]] +; CHECK: [[BB34:.*]]: +; CHECK-NEXT: [[RET84:%.*]] = phi half [ [[TMP115:%.*]], %[[FREM_LOOP_EXIT94:.*]] ], [ [[TMP106:%.*]], %[[FREM_ELSE86]] ] +; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq half [[TMP32]], 0xH0000 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], half 0xH7E00, half [[RET84]] +; CHECK-NEXT: [[TMP37:%.*]] = call half @llvm.fabs.f16(half [[TMP31]]) +; CHECK-NEXT: [[TMP38:%.*]] = fcmp ult half [[TMP37]], 0xH7C00 +; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], half [[TMP36]], half 0xH7E00 +; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x half> [[TMP30]], half [[TMP39]], i64 3 +; CHECK-NEXT: store <4 x half> [[R2]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP40:%.*]] = 
call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]]) +; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1 +; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 11) +; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]]) +; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1 +; CHECK-NEXT: [[AY4:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP44]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: [[AYINV:%.*]] = fdiv float 1.000000e+00, [[AY4]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 11 +; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP47:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]]) +; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX1]], [[AY2]] +; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], half [[TMP47]], half [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP50:%.*]] = fmul float [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call float @llvm.rint.f32(float [[TMP50]]) +; CHECK-NEXT: [[TMP51:%.*]] = fneg float [[Q]] +; CHECK-NEXT: [[AX5:%.*]] = call float @llvm.fma.f32(float [[TMP51]], float [[AY4]], float [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt float [[AX5]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd float [[AX5]], [[AY4]] +; CHECK-NEXT: [[AX6:%.*]] = select i1 [[CLT]], float [[AXP]], float 
[[AX5]] +; CHECK-NEXT: [[AX_UPDATE]] = call float @llvm.ldexp.f32.i32(float [[AX6]], i32 11) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 11 +; CHECK-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[NB_IV]], 11 +; CHECK-NEXT: br i1 [[TMP52]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[NB_EXIT_PHI]], 11 +; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP53]], 1 +; CHECK-NEXT: [[AX7:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI]], i32 [[TMP54]]) +; CHECK-NEXT: [[TMP55:%.*]] = fmul float [[AX7]], [[AYINV]] +; CHECK-NEXT: [[Q8:%.*]] = call float @llvm.rint.f32(float [[TMP55]]) +; CHECK-NEXT: [[TMP56:%.*]] = fneg float [[Q8]] +; CHECK-NEXT: [[AX9:%.*]] = call float @llvm.fma.f32(float [[TMP56]], float [[AY4]], float [[AX7]]) +; CHECK-NEXT: [[CLT10:%.*]] = fcmp olt float [[AX9]], 0.000000e+00 +; CHECK-NEXT: [[AXP11:%.*]] = fadd float [[AX9]], [[AY4]] +; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]] +; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]]) +; CHECK-NEXT: [[TMP57:%.*]] = fptrunc float [[AX13]] to half +; CHECK-NEXT: [[TMP58]] = call half @llvm.copysign.f16(half [[TMP57]], half [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_COMPUTE19]]: +; CHECK-NEXT: [[TMP59:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]]) +; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } [[TMP59]], 0 +; CHECK-NEXT: [[TMP61:%.*]] = extractvalue { float, i32 } [[TMP59]], 1 +; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP61]], 1 +; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP60]], i32 11) +; CHECK-NEXT: [[TMP62:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 
[[AY17]]) +; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP62]], 0 +; CHECK-NEXT: [[TMP64:%.*]] = extractvalue { float, i32 } [[TMP62]], 1 +; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP64]], 1 +; CHECK-NEXT: [[AY24:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP63]], i32 1) +; CHECK-NEXT: [[NB25:%.*]] = sub i32 [[EX21]], [[EY23]] +; CHECK-NEXT: [[AYINV26:%.*]] = fdiv float 1.000000e+00, [[AY24]] +; CHECK-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[NB25]], 11 +; CHECK-NEXT: br i1 [[TMP65]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]] +; CHECK: [[FREM_ELSE20]]: +; CHECK-NEXT: [[TMP66:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]]) +; CHECK-NEXT: [[TMP67:%.*]] = fcmp oeq float [[AX16]], [[AY17]] +; CHECK-NEXT: [[TMP68]] = select i1 [[TMP67]], half [[TMP66]], half [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] +; CHECK: [[FREM_LOOP_BODY27]]: +; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ] +; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ] +; CHECK-NEXT: [[TMP69:%.*]] = fmul float [[AX_LOOP_PHI30]], [[AYINV26]] +; CHECK-NEXT: [[Q31:%.*]] = call float @llvm.rint.f32(float [[TMP69]]) +; CHECK-NEXT: [[TMP70:%.*]] = fneg float [[Q31]] +; CHECK-NEXT: [[AX32:%.*]] = call float @llvm.fma.f32(float [[TMP70]], float [[AY24]], float [[AX_LOOP_PHI30]]) +; CHECK-NEXT: [[CLT33:%.*]] = fcmp olt float [[AX32]], 0.000000e+00 +; CHECK-NEXT: [[AXP34:%.*]] = fadd float [[AX32]], [[AY24]] +; CHECK-NEXT: [[AX35:%.*]] = select i1 [[CLT33]], float [[AXP34]], float [[AX32]] +; CHECK-NEXT: [[AX_UPDATE36]] = call float @llvm.ldexp.f32.i32(float [[AX35]], i32 11) +; CHECK-NEXT: [[NB_UPDATE37]] = sub i32 [[NB_IV29]], 11 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sgt i32 [[NB_IV29]], 11 +; CHECK-NEXT: br i1 [[TMP71]], label %[[FREM_LOOP_BODY27]], label %[[FREM_LOOP_EXIT28]] +; CHECK: [[FREM_LOOP_EXIT28]]: +; 
CHECK-NEXT: [[AX_EXIT_PHI38:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_LOOP_PHI30]], %[[FREM_LOOP_BODY27]] ] +; CHECK-NEXT: [[NB_EXIT_PHI39:%.*]] = phi i32 [ [[NB_IV29]], %[[FREM_LOOP_BODY27]] ], [ [[NB25]], %[[FREM_COMPUTE19]] ] +; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[NB_EXIT_PHI39]], 11 +; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP72]], 1 +; CHECK-NEXT: [[AX40:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI38]], i32 [[TMP73]]) +; CHECK-NEXT: [[TMP74:%.*]] = fmul float [[AX40]], [[AYINV26]] +; CHECK-NEXT: [[Q41:%.*]] = call float @llvm.rint.f32(float [[TMP74]]) +; CHECK-NEXT: [[TMP75:%.*]] = fneg float [[Q41]] +; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.fma.f32(float [[TMP75]], float [[AY24]], float [[AX40]]) +; CHECK-NEXT: [[CLT43:%.*]] = fcmp olt float [[AX42]], 0.000000e+00 +; CHECK-NEXT: [[AXP44:%.*]] = fadd float [[AX42]], [[AY24]] +; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]] +; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]]) +; CHECK-NEXT: [[TMP76:%.*]] = fptrunc float [[AX46]] to half +; CHECK-NEXT: [[TMP77]] = call half @llvm.copysign.f16(half [[TMP76]], half [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] +; CHECK: [[FREM_COMPUTE52]]: +; CHECK-NEXT: [[TMP78:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX49]]) +; CHECK-NEXT: [[TMP79:%.*]] = extractvalue { float, i32 } [[TMP78]], 0 +; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP78]], 1 +; CHECK-NEXT: [[EX54:%.*]] = sub i32 [[TMP80]], 1 +; CHECK-NEXT: [[AX55:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP79]], i32 11) +; CHECK-NEXT: [[TMP81:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY50]]) +; CHECK-NEXT: [[TMP82:%.*]] = extractvalue { float, i32 } [[TMP81]], 0 +; CHECK-NEXT: [[TMP83:%.*]] = extractvalue { float, i32 } [[TMP81]], 1 +; CHECK-NEXT: [[EY56:%.*]] = sub i32 [[TMP83]], 1 +; CHECK-NEXT: [[AY57:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP82]], 
i32 1) +; CHECK-NEXT: [[NB58:%.*]] = sub i32 [[EX54]], [[EY56]] +; CHECK-NEXT: [[AYINV59:%.*]] = fdiv float 1.000000e+00, [[AY57]] +; CHECK-NEXT: [[TMP84:%.*]] = icmp sgt i32 [[NB58]], 11 +; CHECK-NEXT: br i1 [[TMP84]], label %[[FREM_LOOP_BODY60:.*]], label %[[FREM_LOOP_EXIT61]] +; CHECK: [[FREM_ELSE53]]: +; CHECK-NEXT: [[TMP85:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP21]]) +; CHECK-NEXT: [[TMP86:%.*]] = fcmp oeq float [[AX49]], [[AY50]] +; CHECK-NEXT: [[TMP87]] = select i1 [[TMP86]], half [[TMP85]], half [[TMP21]] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[FREM_LOOP_BODY60]]: +; CHECK-NEXT: [[NB_IV62:%.*]] = phi i32 [ [[NB58]], %[[FREM_COMPUTE52]] ], [ [[NB_UPDATE70:%.*]], %[[FREM_LOOP_BODY60]] ] +; CHECK-NEXT: [[AX_LOOP_PHI63:%.*]] = phi float [ [[AX55]], %[[FREM_COMPUTE52]] ], [ [[AX_UPDATE69:%.*]], %[[FREM_LOOP_BODY60]] ] +; CHECK-NEXT: [[TMP88:%.*]] = fmul float [[AX_LOOP_PHI63]], [[AYINV59]] +; CHECK-NEXT: [[Q64:%.*]] = call float @llvm.rint.f32(float [[TMP88]]) +; CHECK-NEXT: [[TMP89:%.*]] = fneg float [[Q64]] +; CHECK-NEXT: [[AX65:%.*]] = call float @llvm.fma.f32(float [[TMP89]], float [[AY57]], float [[AX_LOOP_PHI63]]) +; CHECK-NEXT: [[CLT66:%.*]] = fcmp olt float [[AX65]], 0.000000e+00 +; CHECK-NEXT: [[AXP67:%.*]] = fadd float [[AX65]], [[AY57]] +; CHECK-NEXT: [[AX68:%.*]] = select i1 [[CLT66]], float [[AXP67]], float [[AX65]] +; CHECK-NEXT: [[AX_UPDATE69]] = call float @llvm.ldexp.f32.i32(float [[AX68]], i32 11) +; CHECK-NEXT: [[NB_UPDATE70]] = sub i32 [[NB_IV62]], 11 +; CHECK-NEXT: [[TMP90:%.*]] = icmp sgt i32 [[NB_IV62]], 11 +; CHECK-NEXT: br i1 [[TMP90]], label %[[FREM_LOOP_BODY60]], label %[[FREM_LOOP_EXIT61]] +; CHECK: [[FREM_LOOP_EXIT61]]: +; CHECK-NEXT: [[AX_EXIT_PHI71:%.*]] = phi float [ [[AX55]], %[[FREM_COMPUTE52]] ], [ [[AX_LOOP_PHI63]], %[[FREM_LOOP_BODY60]] ] +; CHECK-NEXT: [[NB_EXIT_PHI72:%.*]] = phi i32 [ [[NB_IV62]], %[[FREM_LOOP_BODY60]] ], [ [[NB58]], %[[FREM_COMPUTE52]] ] +; CHECK-NEXT: [[TMP91:%.*]] = sub i32 
[[NB_EXIT_PHI72]], 11 +; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP91]], 1 +; CHECK-NEXT: [[AX73:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI71]], i32 [[TMP92]]) +; CHECK-NEXT: [[TMP93:%.*]] = fmul float [[AX73]], [[AYINV59]] +; CHECK-NEXT: [[Q74:%.*]] = call float @llvm.rint.f32(float [[TMP93]]) +; CHECK-NEXT: [[TMP94:%.*]] = fneg float [[Q74]] +; CHECK-NEXT: [[AX75:%.*]] = call float @llvm.fma.f32(float [[TMP94]], float [[AY57]], float [[AX73]]) +; CHECK-NEXT: [[CLT76:%.*]] = fcmp olt float [[AX75]], 0.000000e+00 +; CHECK-NEXT: [[AXP77:%.*]] = fadd float [[AX75]], [[AY57]] +; CHECK-NEXT: [[AX78:%.*]] = select i1 [[CLT76]], float [[AXP77]], float [[AX75]] +; CHECK-NEXT: [[AX79:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX78]], i32 [[EY56]]) +; CHECK-NEXT: [[TMP95:%.*]] = fptrunc float [[AX79]] to half +; CHECK-NEXT: [[TMP96]] = call half @llvm.copysign.f16(half [[TMP95]], half [[TMP21]]) +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[FREM_COMPUTE85]]: +; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX82]]) +; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0 +; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1 +; CHECK-NEXT: [[EX87:%.*]] = sub i32 [[TMP99]], 1 +; CHECK-NEXT: [[AX88:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP98]], i32 11) +; CHECK-NEXT: [[TMP100:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY83]]) +; CHECK-NEXT: [[TMP101:%.*]] = extractvalue { float, i32 } [[TMP100]], 0 +; CHECK-NEXT: [[TMP102:%.*]] = extractvalue { float, i32 } [[TMP100]], 1 +; CHECK-NEXT: [[EY89:%.*]] = sub i32 [[TMP102]], 1 +; CHECK-NEXT: [[AY90:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP101]], i32 1) +; CHECK-NEXT: [[NB91:%.*]] = sub i32 [[EX87]], [[EY89]] +; CHECK-NEXT: [[AYINV92:%.*]] = fdiv float 1.000000e+00, [[AY90]] +; CHECK-NEXT: [[TMP103:%.*]] = icmp sgt i32 [[NB91]], 11 +; CHECK-NEXT: br i1 [[TMP103]], label %[[FREM_LOOP_BODY93:.*]], label %[[FREM_LOOP_EXIT94]] 
+; CHECK: [[FREM_ELSE86]]: +; CHECK-NEXT: [[TMP104:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP31]]) +; CHECK-NEXT: [[TMP105:%.*]] = fcmp oeq float [[AX82]], [[AY83]] +; CHECK-NEXT: [[TMP106]] = select i1 [[TMP105]], half [[TMP104]], half [[TMP31]] +; CHECK-NEXT: br label %[[BB34]] +; CHECK: [[FREM_LOOP_BODY93]]: +; CHECK-NEXT: [[NB_IV95:%.*]] = phi i32 [ [[NB91]], %[[FREM_COMPUTE85]] ], [ [[NB_UPDATE103:%.*]], %[[FREM_LOOP_BODY93]] ] +; CHECK-NEXT: [[AX_LOOP_PHI96:%.*]] = phi float [ [[AX88]], %[[FREM_COMPUTE85]] ], [ [[AX_UPDATE102:%.*]], %[[FREM_LOOP_BODY93]] ] +; CHECK-NEXT: [[TMP107:%.*]] = fmul float [[AX_LOOP_PHI96]], [[AYINV92]] +; CHECK-NEXT: [[Q97:%.*]] = call float @llvm.rint.f32(float [[TMP107]]) +; CHECK-NEXT: [[TMP108:%.*]] = fneg float [[Q97]] +; CHECK-NEXT: [[AX98:%.*]] = call float @llvm.fma.f32(float [[TMP108]], float [[AY90]], float [[AX_LOOP_PHI96]]) +; CHECK-NEXT: [[CLT99:%.*]] = fcmp olt float [[AX98]], 0.000000e+00 +; CHECK-NEXT: [[AXP100:%.*]] = fadd float [[AX98]], [[AY90]] +; CHECK-NEXT: [[AX101:%.*]] = select i1 [[CLT99]], float [[AXP100]], float [[AX98]] +; CHECK-NEXT: [[AX_UPDATE102]] = call float @llvm.ldexp.f32.i32(float [[AX101]], i32 11) +; CHECK-NEXT: [[NB_UPDATE103]] = sub i32 [[NB_IV95]], 11 +; CHECK-NEXT: [[TMP109:%.*]] = icmp sgt i32 [[NB_IV95]], 11 +; CHECK-NEXT: br i1 [[TMP109]], label %[[FREM_LOOP_BODY93]], label %[[FREM_LOOP_EXIT94]] +; CHECK: [[FREM_LOOP_EXIT94]]: +; CHECK-NEXT: [[AX_EXIT_PHI104:%.*]] = phi float [ [[AX88]], %[[FREM_COMPUTE85]] ], [ [[AX_LOOP_PHI96]], %[[FREM_LOOP_BODY93]] ] +; CHECK-NEXT: [[NB_EXIT_PHI105:%.*]] = phi i32 [ [[NB_IV95]], %[[FREM_LOOP_BODY93]] ], [ [[NB91]], %[[FREM_COMPUTE85]] ] +; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[NB_EXIT_PHI105]], 11 +; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP110]], 1 +; CHECK-NEXT: [[AX106:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI104]], i32 [[TMP111]]) +; CHECK-NEXT: [[TMP112:%.*]] = fmul float [[AX106]], [[AYINV92]] +; 
CHECK-NEXT: [[Q107:%.*]] = call float @llvm.rint.f32(float [[TMP112]]) +; CHECK-NEXT: [[TMP113:%.*]] = fneg float [[Q107]] +; CHECK-NEXT: [[AX108:%.*]] = call float @llvm.fma.f32(float [[TMP113]], float [[AY90]], float [[AX106]]) +; CHECK-NEXT: [[CLT109:%.*]] = fcmp olt float [[AX108]], 0.000000e+00 +; CHECK-NEXT: [[AXP110:%.*]] = fadd float [[AX108]], [[AY90]] +; CHECK-NEXT: [[AX111:%.*]] = select i1 [[CLT109]], float [[AXP110]], float [[AX108]] +; CHECK-NEXT: [[AX112:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX111]], i32 [[EY89]]) +; CHECK-NEXT: [[TMP114:%.*]] = fptrunc float [[AX112]] to half +; CHECK-NEXT: [[TMP115]] = call half @llvm.copysign.f16(half [[TMP114]], half [[TMP31]]) +; CHECK-NEXT: br label %[[BB34]] +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 + %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16 + %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16 + %r2 = frem <4 x half> %r0, %r1 + store <4 x half> %r2, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_v2f32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <2 x float>, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load <2 x float>, ptr addrspace(1) [[IN1]], align 8 +; CHECK-NEXT: [[R1:%.*]] = load <2 x float>, ptr addrspace(1) [[GEP2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[R0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[R1]], i64 0 +; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) +; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB4:.*]]: +; 
CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]] +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) +; CHECK-NEXT: [[TMP8:%.*]] = fcmp ult float [[TMP7]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float 0x7FF8000000000000 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[R0]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[R1]], i64 1 +; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) +; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] +; CHECK: [[BB14:.*]]: +; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ] +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00 +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]] +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = fcmp ult float [[TMP17]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[TMP16]], float 0x7FF8000000000000 +; CHECK-NEXT: [[R2:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP19]], i64 1 +; CHECK-NEXT: store <2 x float> [[R2]], ptr addrspace(1) [[OUT]], align 8 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]]) +; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = 
extractvalue { float, i32 } [[TMP20]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1 +; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 12) +; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]]) +; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1 +; CHECK-NEXT: [[AY2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP24]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: [[AYINV:%.*]] = fdiv float 1.000000e+00, [[AY2]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 12 +; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]]) +; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX]], [[AY]] +; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], float [[TMP27]], float [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = fmul float [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call float @llvm.rint.f32(float [[TMP30]]) +; CHECK-NEXT: [[TMP31:%.*]] = fneg float [[Q]] +; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.fma.f32(float [[TMP31]], float [[AY2]], float [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt float [[AX3]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd float [[AX3]], [[AY2]] +; CHECK-NEXT: [[AX4:%.*]] = select i1 [[CLT]], float [[AXP]], float [[AX3]] +; CHECK-NEXT: [[AX_UPDATE]] = call float @llvm.ldexp.f32.i32(float [[AX4]], i32 12) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 12 +; 
CHECK-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[NB_IV]], 12 +; CHECK-NEXT: br i1 [[TMP32]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP33:%.*]] = sub i32 [[NB_EXIT_PHI]], 12 +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], 1 +; CHECK-NEXT: [[AX5:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI]], i32 [[TMP34]]) +; CHECK-NEXT: [[TMP35:%.*]] = fmul float [[AX5]], [[AYINV]] +; CHECK-NEXT: [[Q6:%.*]] = call float @llvm.rint.f32(float [[TMP35]]) +; CHECK-NEXT: [[TMP36:%.*]] = fneg float [[Q6]] +; CHECK-NEXT: [[AX7:%.*]] = call float @llvm.fma.f32(float [[TMP36]], float [[AY2]], float [[AX5]]) +; CHECK-NEXT: [[CLT8:%.*]] = fcmp olt float [[AX7]], 0.000000e+00 +; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]] +; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]] +; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]]) +; CHECK-NEXT: [[TMP37]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_COMPUTE15]]: +; CHECK-NEXT: [[TMP38:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]]) +; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { float, i32 } [[TMP38]], 0 +; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP38]], 1 +; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1 +; CHECK-NEXT: [[AX18:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP39]], i32 12) +; CHECK-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]]) +; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP41]], 0 +; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP41]], 1 +; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1 +; 
CHECK-NEXT: [[AY20:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP42]], i32 1) +; CHECK-NEXT: [[NB21:%.*]] = sub i32 [[EX17]], [[EY19]] +; CHECK-NEXT: [[AYINV22:%.*]] = fdiv float 1.000000e+00, [[AY20]] +; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 12 +; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]] +; CHECK: [[FREM_ELSE16]]: +; CHECK-NEXT: [[TMP45:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]]) +; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq float [[AX12]], [[AY13]] +; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], float [[TMP45]], float [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] +; CHECK: [[FREM_LOOP_BODY23]]: +; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[TMP48:%.*]] = fmul float [[AX_LOOP_PHI26]], [[AYINV22]] +; CHECK-NEXT: [[Q27:%.*]] = call float @llvm.rint.f32(float [[TMP48]]) +; CHECK-NEXT: [[TMP49:%.*]] = fneg float [[Q27]] +; CHECK-NEXT: [[AX28:%.*]] = call float @llvm.fma.f32(float [[TMP49]], float [[AY20]], float [[AX_LOOP_PHI26]]) +; CHECK-NEXT: [[CLT29:%.*]] = fcmp olt float [[AX28]], 0.000000e+00 +; CHECK-NEXT: [[AXP30:%.*]] = fadd float [[AX28]], [[AY20]] +; CHECK-NEXT: [[AX31:%.*]] = select i1 [[CLT29]], float [[AXP30]], float [[AX28]] +; CHECK-NEXT: [[AX_UPDATE32]] = call float @llvm.ldexp.f32.i32(float [[AX31]], i32 12) +; CHECK-NEXT: [[NB_UPDATE33]] = sub i32 [[NB_IV25]], 12 +; CHECK-NEXT: [[TMP50:%.*]] = icmp sgt i32 [[NB_IV25]], 12 +; CHECK-NEXT: br i1 [[TMP50]], label %[[FREM_LOOP_BODY23]], label %[[FREM_LOOP_EXIT24]] +; CHECK: [[FREM_LOOP_EXIT24]]: +; CHECK-NEXT: [[AX_EXIT_PHI34:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_LOOP_PHI26]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[NB_EXIT_PHI35:%.*]] = phi i32 [ [[NB_IV25]], 
%[[FREM_LOOP_BODY23]] ], [ [[NB21]], %[[FREM_COMPUTE15]] ] +; CHECK-NEXT: [[TMP51:%.*]] = sub i32 [[NB_EXIT_PHI35]], 12 +; CHECK-NEXT: [[TMP52:%.*]] = add i32 [[TMP51]], 1 +; CHECK-NEXT: [[AX36:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI34]], i32 [[TMP52]]) +; CHECK-NEXT: [[TMP53:%.*]] = fmul float [[AX36]], [[AYINV22]] +; CHECK-NEXT: [[Q37:%.*]] = call float @llvm.rint.f32(float [[TMP53]]) +; CHECK-NEXT: [[TMP54:%.*]] = fneg float [[Q37]] +; CHECK-NEXT: [[AX38:%.*]] = call float @llvm.fma.f32(float [[TMP54]], float [[AY20]], float [[AX36]]) +; CHECK-NEXT: [[CLT39:%.*]] = fcmp olt float [[AX38]], 0.000000e+00 +; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]] +; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]] +; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]]) +; CHECK-NEXT: [[TMP55]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 + %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8 + %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8 + %r2 = frem <2 x float> %r0, %r1 + store <2 x float> %r2, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_v4f32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load <4 x float>, ptr addrspace(1) [[IN1]], align 16 +; CHECK-NEXT: [[R1:%.*]] = load <4 x float>, ptr addrspace(1) [[GEP2]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[R0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[R1]], i64 0 +; CHECK-NEXT: [[AX:%.*]] = call float 
@llvm.fabs.f32(float [[TMP1]]) +; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB4:.*]]: +; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]] +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) +; CHECK-NEXT: [[TMP8:%.*]] = fcmp ult float [[TMP7]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float 0x7FF8000000000000 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[R0]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[R1]], i64 1 +; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) +; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] +; CHECK: [[BB14:.*]]: +; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP75:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP67:%.*]], %[[FREM_ELSE16]] ] +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00 +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]] +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = fcmp ult float [[TMP17]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[TMP16]], float 0x7FF8000000000000 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP19]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 
x float> [[R0]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[R1]], i64 2 +; CHECK-NEXT: [[AX43:%.*]] = call float @llvm.fabs.f32(float [[TMP21]]) +; CHECK-NEXT: [[AY44:%.*]] = call float @llvm.fabs.f32(float [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX43]], [[AY44]] +; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE46:.*]], label %[[FREM_ELSE47:.*]] +; CHECK: [[BB24:.*]]: +; CHECK-NEXT: [[RET45:%.*]] = phi float [ [[TMP93:%.*]], %[[FREM_LOOP_EXIT55:.*]] ], [ [[TMP85:%.*]], %[[FREM_ELSE47]] ] +; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq float [[TMP22]], 0.000000e+00 +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float 0x7FF8000000000000, float [[RET45]] +; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.fabs.f32(float [[TMP21]]) +; CHECK-NEXT: [[TMP28:%.*]] = fcmp ult float [[TMP27]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float 0x7FF8000000000000 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP29]], i64 2 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[R0]], i64 3 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[R1]], i64 3 +; CHECK-NEXT: [[AX74:%.*]] = call float @llvm.fabs.f32(float [[TMP31]]) +; CHECK-NEXT: [[AY75:%.*]] = call float @llvm.fabs.f32(float [[TMP32]]) +; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX74]], [[AY75]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE77:.*]], label %[[FREM_ELSE78:.*]] +; CHECK: [[BB34:.*]]: +; CHECK-NEXT: [[RET76:%.*]] = phi float [ [[TMP111:%.*]], %[[FREM_LOOP_EXIT86:.*]] ], [ [[TMP103:%.*]], %[[FREM_ELSE78]] ] +; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq float [[TMP32]], 0.000000e+00 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float 0x7FF8000000000000, float [[RET76]] +; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.fabs.f32(float [[TMP31]]) +; CHECK-NEXT: [[TMP38:%.*]] = fcmp ult float [[TMP37]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], float 
[[TMP36]], float 0x7FF8000000000000 +; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP39]], i64 3 +; CHECK-NEXT: store <4 x float> [[R2]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]]) +; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1 +; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 12) +; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]]) +; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1 +; CHECK-NEXT: [[AY2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP44]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: [[AYINV:%.*]] = fdiv float 1.000000e+00, [[AY2]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 12 +; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP47:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]]) +; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX]], [[AY]] +; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], float [[TMP47]], float [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP50:%.*]] = fmul float [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call float @llvm.rint.f32(float [[TMP50]]) +; CHECK-NEXT: [[TMP51:%.*]] = fneg float [[Q]] +; CHECK-NEXT: 
[[AX3:%.*]] = call float @llvm.fma.f32(float [[TMP51]], float [[AY2]], float [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt float [[AX3]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd float [[AX3]], [[AY2]] +; CHECK-NEXT: [[AX4:%.*]] = select i1 [[CLT]], float [[AXP]], float [[AX3]] +; CHECK-NEXT: [[AX_UPDATE]] = call float @llvm.ldexp.f32.i32(float [[AX4]], i32 12) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 12 +; CHECK-NEXT: [[TMP52:%.*]] = icmp sgt i32 [[NB_IV]], 12 +; CHECK-NEXT: br i1 [[TMP52]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[NB_EXIT_PHI]], 12 +; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP53]], 1 +; CHECK-NEXT: [[AX5:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI]], i32 [[TMP54]]) +; CHECK-NEXT: [[TMP55:%.*]] = fmul float [[AX5]], [[AYINV]] +; CHECK-NEXT: [[Q6:%.*]] = call float @llvm.rint.f32(float [[TMP55]]) +; CHECK-NEXT: [[TMP56:%.*]] = fneg float [[Q6]] +; CHECK-NEXT: [[AX7:%.*]] = call float @llvm.fma.f32(float [[TMP56]], float [[AY2]], float [[AX5]]) +; CHECK-NEXT: [[CLT8:%.*]] = fcmp olt float [[AX7]], 0.000000e+00 +; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]] +; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]] +; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]]) +; CHECK-NEXT: [[TMP57]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_COMPUTE15]]: +; CHECK-NEXT: [[TMP58:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]]) +; CHECK-NEXT: [[TMP59:%.*]] = extractvalue { float, i32 } [[TMP58]], 0 +; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } 
[[TMP58]], 1 +; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP60]], 1 +; CHECK-NEXT: [[AX18:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP59]], i32 12) +; CHECK-NEXT: [[TMP61:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]]) +; CHECK-NEXT: [[TMP62:%.*]] = extractvalue { float, i32 } [[TMP61]], 0 +; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP61]], 1 +; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP63]], 1 +; CHECK-NEXT: [[AY20:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP62]], i32 1) +; CHECK-NEXT: [[NB21:%.*]] = sub i32 [[EX17]], [[EY19]] +; CHECK-NEXT: [[AYINV22:%.*]] = fdiv float 1.000000e+00, [[AY20]] +; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[NB21]], 12 +; CHECK-NEXT: br i1 [[TMP64]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]] +; CHECK: [[FREM_ELSE16]]: +; CHECK-NEXT: [[TMP65:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]]) +; CHECK-NEXT: [[TMP66:%.*]] = fcmp oeq float [[AX12]], [[AY13]] +; CHECK-NEXT: [[TMP67]] = select i1 [[TMP66]], float [[TMP65]], float [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] +; CHECK: [[FREM_LOOP_BODY23]]: +; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[TMP68:%.*]] = fmul float [[AX_LOOP_PHI26]], [[AYINV22]] +; CHECK-NEXT: [[Q27:%.*]] = call float @llvm.rint.f32(float [[TMP68]]) +; CHECK-NEXT: [[TMP69:%.*]] = fneg float [[Q27]] +; CHECK-NEXT: [[AX28:%.*]] = call float @llvm.fma.f32(float [[TMP69]], float [[AY20]], float [[AX_LOOP_PHI26]]) +; CHECK-NEXT: [[CLT29:%.*]] = fcmp olt float [[AX28]], 0.000000e+00 +; CHECK-NEXT: [[AXP30:%.*]] = fadd float [[AX28]], [[AY20]] +; CHECK-NEXT: [[AX31:%.*]] = select i1 [[CLT29]], float [[AXP30]], float [[AX28]] +; CHECK-NEXT: [[AX_UPDATE32]] = call float @llvm.ldexp.f32.i32(float [[AX31]], i32 12) +; 
CHECK-NEXT: [[NB_UPDATE33]] = sub i32 [[NB_IV25]], 12 +; CHECK-NEXT: [[TMP70:%.*]] = icmp sgt i32 [[NB_IV25]], 12 +; CHECK-NEXT: br i1 [[TMP70]], label %[[FREM_LOOP_BODY23]], label %[[FREM_LOOP_EXIT24]] +; CHECK: [[FREM_LOOP_EXIT24]]: +; CHECK-NEXT: [[AX_EXIT_PHI34:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_LOOP_PHI26]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[NB_EXIT_PHI35:%.*]] = phi i32 [ [[NB_IV25]], %[[FREM_LOOP_BODY23]] ], [ [[NB21]], %[[FREM_COMPUTE15]] ] +; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[NB_EXIT_PHI35]], 12 +; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP71]], 1 +; CHECK-NEXT: [[AX36:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI34]], i32 [[TMP72]]) +; CHECK-NEXT: [[TMP73:%.*]] = fmul float [[AX36]], [[AYINV22]] +; CHECK-NEXT: [[Q37:%.*]] = call float @llvm.rint.f32(float [[TMP73]]) +; CHECK-NEXT: [[TMP74:%.*]] = fneg float [[Q37]] +; CHECK-NEXT: [[AX38:%.*]] = call float @llvm.fma.f32(float [[TMP74]], float [[AY20]], float [[AX36]]) +; CHECK-NEXT: [[CLT39:%.*]] = fcmp olt float [[AX38]], 0.000000e+00 +; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]] +; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]] +; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]]) +; CHECK-NEXT: [[TMP75]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] +; CHECK: [[FREM_COMPUTE46]]: +; CHECK-NEXT: [[TMP76:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX43]]) +; CHECK-NEXT: [[TMP77:%.*]] = extractvalue { float, i32 } [[TMP76]], 0 +; CHECK-NEXT: [[TMP78:%.*]] = extractvalue { float, i32 } [[TMP76]], 1 +; CHECK-NEXT: [[EX48:%.*]] = sub i32 [[TMP78]], 1 +; CHECK-NEXT: [[AX49:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP77]], i32 12) +; CHECK-NEXT: [[TMP79:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY44]]) +; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP79]], 0 +; CHECK-NEXT: 
[[TMP81:%.*]] = extractvalue { float, i32 } [[TMP79]], 1 +; CHECK-NEXT: [[EY50:%.*]] = sub i32 [[TMP81]], 1 +; CHECK-NEXT: [[AY51:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP80]], i32 1) +; CHECK-NEXT: [[NB52:%.*]] = sub i32 [[EX48]], [[EY50]] +; CHECK-NEXT: [[AYINV53:%.*]] = fdiv float 1.000000e+00, [[AY51]] +; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[NB52]], 12 +; CHECK-NEXT: br i1 [[TMP82]], label %[[FREM_LOOP_BODY54:.*]], label %[[FREM_LOOP_EXIT55]] +; CHECK: [[FREM_ELSE47]]: +; CHECK-NEXT: [[TMP83:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP21]]) +; CHECK-NEXT: [[TMP84:%.*]] = fcmp oeq float [[AX43]], [[AY44]] +; CHECK-NEXT: [[TMP85]] = select i1 [[TMP84]], float [[TMP83]], float [[TMP21]] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[FREM_LOOP_BODY54]]: +; CHECK-NEXT: [[NB_IV56:%.*]] = phi i32 [ [[NB52]], %[[FREM_COMPUTE46]] ], [ [[NB_UPDATE64:%.*]], %[[FREM_LOOP_BODY54]] ] +; CHECK-NEXT: [[AX_LOOP_PHI57:%.*]] = phi float [ [[AX49]], %[[FREM_COMPUTE46]] ], [ [[AX_UPDATE63:%.*]], %[[FREM_LOOP_BODY54]] ] +; CHECK-NEXT: [[TMP86:%.*]] = fmul float [[AX_LOOP_PHI57]], [[AYINV53]] +; CHECK-NEXT: [[Q58:%.*]] = call float @llvm.rint.f32(float [[TMP86]]) +; CHECK-NEXT: [[TMP87:%.*]] = fneg float [[Q58]] +; CHECK-NEXT: [[AX59:%.*]] = call float @llvm.fma.f32(float [[TMP87]], float [[AY51]], float [[AX_LOOP_PHI57]]) +; CHECK-NEXT: [[CLT60:%.*]] = fcmp olt float [[AX59]], 0.000000e+00 +; CHECK-NEXT: [[AXP61:%.*]] = fadd float [[AX59]], [[AY51]] +; CHECK-NEXT: [[AX62:%.*]] = select i1 [[CLT60]], float [[AXP61]], float [[AX59]] +; CHECK-NEXT: [[AX_UPDATE63]] = call float @llvm.ldexp.f32.i32(float [[AX62]], i32 12) +; CHECK-NEXT: [[NB_UPDATE64]] = sub i32 [[NB_IV56]], 12 +; CHECK-NEXT: [[TMP88:%.*]] = icmp sgt i32 [[NB_IV56]], 12 +; CHECK-NEXT: br i1 [[TMP88]], label %[[FREM_LOOP_BODY54]], label %[[FREM_LOOP_EXIT55]] +; CHECK: [[FREM_LOOP_EXIT55]]: +; CHECK-NEXT: [[AX_EXIT_PHI65:%.*]] = phi float [ [[AX49]], %[[FREM_COMPUTE46]] ], [ 
[[AX_LOOP_PHI57]], %[[FREM_LOOP_BODY54]] ] +; CHECK-NEXT: [[NB_EXIT_PHI66:%.*]] = phi i32 [ [[NB_IV56]], %[[FREM_LOOP_BODY54]] ], [ [[NB52]], %[[FREM_COMPUTE46]] ] +; CHECK-NEXT: [[TMP89:%.*]] = sub i32 [[NB_EXIT_PHI66]], 12 +; CHECK-NEXT: [[TMP90:%.*]] = add i32 [[TMP89]], 1 +; CHECK-NEXT: [[AX67:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX_EXIT_PHI65]], i32 [[TMP90]]) +; CHECK-NEXT: [[TMP91:%.*]] = fmul float [[AX67]], [[AYINV53]] +; CHECK-NEXT: [[Q68:%.*]] = call float @llvm.rint.f32(float [[TMP91]]) +; CHECK-NEXT: [[TMP92:%.*]] = fneg float [[Q68]] +; CHECK-NEXT: [[AX69:%.*]] = call float @llvm.fma.f32(float [[TMP92]], float [[AY51]], float [[AX67]]) +; CHECK-NEXT: [[CLT70:%.*]] = fcmp olt float [[AX69]], 0.000000e+00 +; CHECK-NEXT: [[AXP71:%.*]] = fadd float [[AX69]], [[AY51]] +; CHECK-NEXT: [[AX72:%.*]] = select i1 [[CLT70]], float [[AXP71]], float [[AX69]] +; CHECK-NEXT: [[AX73:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX72]], i32 [[EY50]]) +; CHECK-NEXT: [[TMP93]] = call float @llvm.copysign.f32(float [[AX73]], float [[TMP21]]) +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[FREM_COMPUTE77]]: +; CHECK-NEXT: [[TMP94:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX74]]) +; CHECK-NEXT: [[TMP95:%.*]] = extractvalue { float, i32 } [[TMP94]], 0 +; CHECK-NEXT: [[TMP96:%.*]] = extractvalue { float, i32 } [[TMP94]], 1 +; CHECK-NEXT: [[EX79:%.*]] = sub i32 [[TMP96]], 1 +; CHECK-NEXT: [[AX80:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP95]], i32 12) +; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY75]]) +; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0 +; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1 +; CHECK-NEXT: [[EY81:%.*]] = sub i32 [[TMP99]], 1 +; CHECK-NEXT: [[AY82:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP98]], i32 1) +; CHECK-NEXT: [[NB83:%.*]] = sub i32 [[EX79]], [[EY81]] +; CHECK-NEXT: [[AYINV84:%.*]] = fdiv float 1.000000e+00, [[AY82]] +; CHECK-NEXT: 
[[TMP100:%.*]] = icmp sgt i32 [[NB83]], 12 +; CHECK-NEXT: br i1 [[TMP100]], label %[[FREM_LOOP_BODY85:.*]], label %[[FREM_LOOP_EXIT86]] +; CHECK: [[FREM_ELSE78]]: +; CHECK-NEXT: [[TMP101:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP31]]) +; CHECK-NEXT: [[TMP102:%.*]] = fcmp oeq float [[AX74]], [[AY75]] +; CHECK-NEXT: [[TMP103]] = select i1 [[TMP102]], float [[TMP101]], float [[TMP31]] +; CHECK-NEXT: br label %[[BB34]] +; CHECK: [[FREM_LOOP_BODY85]]: +; CHECK-NEXT: [[NB_IV87:%.*]] = phi i32 [ [[NB83]], %[[FREM_COMPUTE77]] ], [ [[NB_UPDATE95:%.*]], %[[FREM_LOOP_BODY85]] ] +; CHECK-NEXT: [[AX_LOOP_PHI88:%.*]] = phi float [ [[AX80]], %[[FREM_COMPUTE77]] ], [ [[AX_UPDATE94:%.*]], %[[FREM_LOOP_BODY85]] ] +; CHECK-NEXT: [[TMP104:%.*]] = fmul float [[AX_LOOP_PHI88]], [[AYINV84]] +; CHECK-NEXT: [[Q89:%.*]] = call float @llvm.rint.f32(float [[TMP104]]) +; CHECK-NEXT: [[TMP105:%.*]] = fneg float [[Q89]] +; CHECK-NEXT: [[AX90:%.*]] = call float @llvm.fma.f32(float [[TMP105]], float [[AY82]], float [[AX_LOOP_PHI88]]) +; CHECK-NEXT: [[CLT91:%.*]] = fcmp olt float [[AX90]], 0.000000e+00 +; CHECK-NEXT: [[AXP92:%.*]] = fadd float [[AX90]], [[AY82]] +; CHECK-NEXT: [[AX93:%.*]] = select i1 [[CLT91]], float [[AXP92]], float [[AX90]] +; CHECK-NEXT: [[AX_UPDATE94]] = call float @llvm.ldexp.f32.i32(float [[AX93]], i32 12) +; CHECK-NEXT: [[NB_UPDATE95]] = sub i32 [[NB_IV87]], 12 +; CHECK-NEXT: [[TMP106:%.*]] = icmp sgt i32 [[NB_IV87]], 12 +; CHECK-NEXT: br i1 [[TMP106]], label %[[FREM_LOOP_BODY85]], label %[[FREM_LOOP_EXIT86]] +; CHECK: [[FREM_LOOP_EXIT86]]: +; CHECK-NEXT: [[AX_EXIT_PHI96:%.*]] = phi float [ [[AX80]], %[[FREM_COMPUTE77]] ], [ [[AX_LOOP_PHI88]], %[[FREM_LOOP_BODY85]] ] +; CHECK-NEXT: [[NB_EXIT_PHI97:%.*]] = phi i32 [ [[NB_IV87]], %[[FREM_LOOP_BODY85]] ], [ [[NB83]], %[[FREM_COMPUTE77]] ] +; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[NB_EXIT_PHI97]], 12 +; CHECK-NEXT: [[TMP108:%.*]] = add i32 [[TMP107]], 1 +; CHECK-NEXT: [[AX98:%.*]] = call float 
@llvm.ldexp.f32.i32(float [[AX_EXIT_PHI96]], i32 [[TMP108]]) +; CHECK-NEXT: [[TMP109:%.*]] = fmul float [[AX98]], [[AYINV84]] +; CHECK-NEXT: [[Q99:%.*]] = call float @llvm.rint.f32(float [[TMP109]]) +; CHECK-NEXT: [[TMP110:%.*]] = fneg float [[Q99]] +; CHECK-NEXT: [[AX100:%.*]] = call float @llvm.fma.f32(float [[TMP110]], float [[AY82]], float [[AX98]]) +; CHECK-NEXT: [[CLT101:%.*]] = fcmp olt float [[AX100]], 0.000000e+00 +; CHECK-NEXT: [[AXP102:%.*]] = fadd float [[AX100]], [[AY82]] +; CHECK-NEXT: [[AX103:%.*]] = select i1 [[CLT101]], float [[AXP102]], float [[AX100]] +; CHECK-NEXT: [[AX104:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX103]], i32 [[EY81]]) +; CHECK-NEXT: [[TMP111]] = call float @llvm.copysign.f32(float [[AX104]], float [[TMP31]]) +; CHECK-NEXT: br label %[[BB34]] +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 + %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16 + %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16 + %r2 = frem <4 x float> %r0, %r1 + store <4 x float> %r2, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; CHECK-LABEL: define amdgpu_kernel void @frem_v2f64( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]]) { +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <2 x double>, ptr addrspace(1) [[IN2]], i32 4 +; CHECK-NEXT: [[R0:%.*]] = load <2 x double>, ptr addrspace(1) [[IN1]], align 16 +; CHECK-NEXT: [[R1:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP2]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[R0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[R1]], i64 0 +; CHECK-NEXT: [[AX:%.*]] = call double @llvm.fabs.f64(double [[TMP1]]) +; CHECK-NEXT: [[AY:%.*]] = call double @llvm.fabs.f64(double [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt double [[AX]], [[AY]] +; CHECK-NEXT: br i1 [[TMP3]], 
label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK: [[BB4:.*]]: +; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq double [[TMP2]], 0.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double 0x7FF8000000000000, double [[RET]] +; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.fabs.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP8:%.*]] = fcmp ult double [[TMP7]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP6]], double 0x7FF8000000000000 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[R0]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[R1]], i64 1 +; CHECK-NEXT: [[AX12:%.*]] = call double @llvm.fabs.f64(double [[TMP11]]) +; CHECK-NEXT: [[AY13:%.*]] = call double @llvm.fabs.f64(double [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt double [[AX12]], [[AY13]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] +; CHECK: [[BB14:.*]]: +; CHECK-NEXT: [[RET14:%.*]] = phi double [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ] +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq double [[TMP12]], 0.000000e+00 +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], double 0x7FF8000000000000, double [[RET14]] +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fabs.f64(double [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = fcmp ult double [[TMP17]], 0x7FF0000000000000 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], double [[TMP16]], double 0x7FF8000000000000 +; CHECK-NEXT: [[R2:%.*]] = insertelement <2 x double> [[TMP10]], double [[TMP19]], i64 1 +; CHECK-NEXT: store <2 x double> [[R2]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; CHECK: [[FREM_COMPUTE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double 
[[AX]]) +; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { double, i32 } [[TMP20]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { double, i32 } [[TMP20]], 1 +; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1 +; CHECK-NEXT: [[AX1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP21]], i32 26) +; CHECK-NEXT: [[TMP23:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]]) +; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { double, i32 } [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { double, i32 } [[TMP23]], 1 +; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1 +; CHECK-NEXT: [[AY2:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP24]], i32 1) +; CHECK-NEXT: [[NB:%.*]] = sub i32 [[EX]], [[EY]] +; CHECK-NEXT: [[AYINV:%.*]] = fdiv double 1.000000e+00, [[AY2]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 26 +; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_ELSE]]: +; CHECK-NEXT: [[TMP27:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP1]]) +; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq double [[AX]], [[AY]] +; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], double [[TMP27]], double [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_LOOP_BODY]]: +; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi double [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = fmul double [[AX_LOOP_PHI]], [[AYINV]] +; CHECK-NEXT: [[Q:%.*]] = call double @llvm.rint.f64(double [[TMP30]]) +; CHECK-NEXT: [[TMP31:%.*]] = fneg double [[Q]] +; CHECK-NEXT: [[AX3:%.*]] = call double @llvm.fma.f64(double [[TMP31]], double [[AY2]], double [[AX_LOOP_PHI]]) +; CHECK-NEXT: [[CLT:%.*]] = fcmp olt double [[AX3]], 0.000000e+00 +; CHECK-NEXT: [[AXP:%.*]] = fadd double [[AX3]], [[AY2]] +; CHECK-NEXT: [[AX4:%.*]] = select i1 [[CLT]], double [[AXP]], double [[AX3]] +; 
CHECK-NEXT: [[AX_UPDATE]] = call double @llvm.ldexp.f64.i32(double [[AX4]], i32 26) +; CHECK-NEXT: [[NB_UPDATE]] = sub i32 [[NB_IV]], 26 +; CHECK-NEXT: [[TMP32:%.*]] = icmp sgt i32 [[NB_IV]], 26 +; CHECK-NEXT: br i1 [[TMP32]], label %[[FREM_LOOP_BODY]], label %[[FREM_LOOP_EXIT]] +; CHECK: [[FREM_LOOP_EXIT]]: +; CHECK-NEXT: [[AX_EXIT_PHI:%.*]] = phi double [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_LOOP_PHI]], %[[FREM_LOOP_BODY]] ] +; CHECK-NEXT: [[NB_EXIT_PHI:%.*]] = phi i32 [ [[NB_IV]], %[[FREM_LOOP_BODY]] ], [ [[NB]], %[[FREM_COMPUTE]] ] +; CHECK-NEXT: [[TMP33:%.*]] = sub i32 [[NB_EXIT_PHI]], 26 +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], 1 +; CHECK-NEXT: [[AX5:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX_EXIT_PHI]], i32 [[TMP34]]) +; CHECK-NEXT: [[TMP35:%.*]] = fmul double [[AX5]], [[AYINV]] +; CHECK-NEXT: [[Q6:%.*]] = call double @llvm.rint.f64(double [[TMP35]]) +; CHECK-NEXT: [[TMP36:%.*]] = fneg double [[Q6]] +; CHECK-NEXT: [[AX7:%.*]] = call double @llvm.fma.f64(double [[TMP36]], double [[AY2]], double [[AX5]]) +; CHECK-NEXT: [[CLT8:%.*]] = fcmp olt double [[AX7]], 0.000000e+00 +; CHECK-NEXT: [[AXP9:%.*]] = fadd double [[AX7]], [[AY2]] +; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], double [[AXP9]], double [[AX7]] +; CHECK-NEXT: [[AX11:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX10]], i32 [[EY]]) +; CHECK-NEXT: [[TMP37]] = call double @llvm.copysign.f64(double [[AX11]], double [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[FREM_COMPUTE15]]: +; CHECK-NEXT: [[TMP38:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX12]]) +; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { double, i32 } [[TMP38]], 0 +; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { double, i32 } [[TMP38]], 1 +; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1 +; CHECK-NEXT: [[AX18:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP39]], i32 26) +; CHECK-NEXT: [[TMP41:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY13]]) +; CHECK-NEXT: [[TMP42:%.*]] = 
extractvalue { double, i32 } [[TMP41]], 0 +; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { double, i32 } [[TMP41]], 1 +; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1 +; CHECK-NEXT: [[AY20:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP42]], i32 1) +; CHECK-NEXT: [[NB21:%.*]] = sub i32 [[EX17]], [[EY19]] +; CHECK-NEXT: [[AYINV22:%.*]] = fdiv double 1.000000e+00, [[AY20]] +; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 26 +; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]] +; CHECK: [[FREM_ELSE16]]: +; CHECK-NEXT: [[TMP45:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP11]]) +; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq double [[AX12]], [[AY13]] +; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], double [[TMP45]], double [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] +; CHECK: [[FREM_LOOP_BODY23]]: +; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi double [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[TMP48:%.*]] = fmul double [[AX_LOOP_PHI26]], [[AYINV22]] +; CHECK-NEXT: [[Q27:%.*]] = call double @llvm.rint.f64(double [[TMP48]]) +; CHECK-NEXT: [[TMP49:%.*]] = fneg double [[Q27]] +; CHECK-NEXT: [[AX28:%.*]] = call double @llvm.fma.f64(double [[TMP49]], double [[AY20]], double [[AX_LOOP_PHI26]]) +; CHECK-NEXT: [[CLT29:%.*]] = fcmp olt double [[AX28]], 0.000000e+00 +; CHECK-NEXT: [[AXP30:%.*]] = fadd double [[AX28]], [[AY20]] +; CHECK-NEXT: [[AX31:%.*]] = select i1 [[CLT29]], double [[AXP30]], double [[AX28]] +; CHECK-NEXT: [[AX_UPDATE32]] = call double @llvm.ldexp.f64.i32(double [[AX31]], i32 26) +; CHECK-NEXT: [[NB_UPDATE33]] = sub i32 [[NB_IV25]], 26 +; CHECK-NEXT: [[TMP50:%.*]] = icmp sgt i32 [[NB_IV25]], 26 +; CHECK-NEXT: br i1 [[TMP50]], label %[[FREM_LOOP_BODY23]], label %[[FREM_LOOP_EXIT24]] +; CHECK: [[FREM_LOOP_EXIT24]]: +; 
CHECK-NEXT: [[AX_EXIT_PHI34:%.*]] = phi double [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_LOOP_PHI26]], %[[FREM_LOOP_BODY23]] ] +; CHECK-NEXT: [[NB_EXIT_PHI35:%.*]] = phi i32 [ [[NB_IV25]], %[[FREM_LOOP_BODY23]] ], [ [[NB21]], %[[FREM_COMPUTE15]] ] +; CHECK-NEXT: [[TMP51:%.*]] = sub i32 [[NB_EXIT_PHI35]], 26 +; CHECK-NEXT: [[TMP52:%.*]] = add i32 [[TMP51]], 1 +; CHECK-NEXT: [[AX36:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX_EXIT_PHI34]], i32 [[TMP52]]) +; CHECK-NEXT: [[TMP53:%.*]] = fmul double [[AX36]], [[AYINV22]] +; CHECK-NEXT: [[Q37:%.*]] = call double @llvm.rint.f64(double [[TMP53]]) +; CHECK-NEXT: [[TMP54:%.*]] = fneg double [[Q37]] +; CHECK-NEXT: [[AX38:%.*]] = call double @llvm.fma.f64(double [[TMP54]], double [[AY20]], double [[AX36]]) +; CHECK-NEXT: [[CLT39:%.*]] = fcmp olt double [[AX38]], 0.000000e+00 +; CHECK-NEXT: [[AXP40:%.*]] = fadd double [[AX38]], [[AY20]] +; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], double [[AXP40]], double [[AX38]] +; CHECK-NEXT: [[AX42:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX41]], i32 [[EY19]]) +; CHECK-NEXT: [[TMP55]] = call double @llvm.copysign.f64(double [[AX42]], double [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] +; + ptr addrspace(1) %in2) { + %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 + %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16 + %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16 + %r2 = frem <2 x double> %r0, %r1 + store <2 x double> %r2, ptr addrspace(1) %out, align 16 + ret void +} diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/lit.local.cfg b/llvm/test/Transforms/ExpandFp/AMDGPU/lit.local.cfg new file mode 100644 index 0000000000000..7c492428aec76 --- /dev/null +++ b/llvm/test/Transforms/ExpandFp/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AMDGPU" in config.root.targets: + config.unsupported = True