Skip to content

Commit f3225f2

Browse files
committed
AMDGPU/GlobalISel: Legalize FDIV64
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70403
1 parent 47feae5 commit f3225f2

File tree

3 files changed

+555
-41
lines changed

3 files changed

+555
-41
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1867,6 +1867,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
18671867
LLT DstTy = MRI.getType(Dst);
18681868
LLT S16 = LLT::scalar(16);
18691869
LLT S32 = LLT::scalar(32);
1870+
LLT S64 = LLT::scalar(64);
18701871

18711872
if (legalizeFastUnsafeFDIV(MI, MRI, B))
18721873
return true;
@@ -1875,6 +1876,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
18751876
return legalizeFDIV16(MI, MRI, B);
18761877
if (DstTy == S32)
18771878
return legalizeFDIV32(MI, MRI, B);
1879+
if (DstTy == S64)
1880+
return legalizeFDIV64(MI, MRI, B);
18781881

18791882
return false;
18801883
}
@@ -2072,6 +2075,88 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
20722075
return true;
20732076
}
20742077

2078+
/// Legalize a 64-bit floating-point division by expanding it into a sequence
/// built from the amdgcn div_scale / rcp / div_fmas / div_fixup intrinsics and
/// FMA-based refinement steps.
///
/// \param MI  The division instruction; operand 0 is the result, operand 1 the
///            numerator (LHS) and operand 2 the denominator (RHS). It is
///            erased once the expansion has been emitted.
/// \param MRI Register info used to create the extra s1 virtual register
///            needed by the SI workaround path.
/// \param B   Builder used to emit the expansion; its insertion point is set
///            to \p MI.
/// \returns true (the expansion is always emitted).
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  // Propagate the original instruction's flags to everything we emit.
  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // NOTE(review): the three register operands below mirror the original code;
  // confirm they match the declared operand list of llvm.amdgcn.div.scale.
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(RHS)
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  // Initial reciprocal estimate of the scaled denominator.
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // Two refinement rounds of the reciprocal:
  //   e = 1 - d * r;  r' = r + r * e
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  // Quotient estimate: scaled numerator times the refined reciprocal. This
  // must be a floating-point multiply; an integer multiply of the s64 bit
  // patterns would produce a meaningless value for the FMA chain below.
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  // Residual of the quotient estimate: n - d * q.
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    // Recompute the condition by comparing the second 32-bit piece (index 1,
    // presumably the high half) of each input with that of its scaled value.
    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    // The second result of div_scale is directly usable as the condition.
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // Define Res directly as the intrinsic's result. Passing a type here would
  // create a fresh, unused s64 def, and a subsequent addDef(Res) would place
  // Res after the intrinsic ID operand, yielding a malformed instruction.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
2159+
20752160
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
20762161
MachineRegisterInfo &MRI,
20772162
MachineIRBuilder &B) const {

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ class AMDGPULegalizerInfo : public LegalizerInfo {
9090
MachineIRBuilder &B) const;
9191
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI,
9292
MachineIRBuilder &B) const;
93+
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
94+
MachineIRBuilder &B) const;
9395
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
9496
MachineIRBuilder &B) const;
9597
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,

0 commit comments

Comments
 (0)