@@ -1867,6 +1867,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
18671867 LLT DstTy = MRI.getType (Dst);
18681868 LLT S16 = LLT::scalar (16 );
18691869 LLT S32 = LLT::scalar (32 );
1870+ LLT S64 = LLT::scalar (64 );
18701871
18711872 if (legalizeFastUnsafeFDIV (MI, MRI, B))
18721873 return true ;
@@ -1875,6 +1876,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
18751876 return legalizeFDIV16 (MI, MRI, B);
18761877 if (DstTy == S32)
18771878 return legalizeFDIV32 (MI, MRI, B);
1879+ if (DstTy == S64)
1880+ return legalizeFDIV64 (MI, MRI, B);
18781881
18791882 return false ;
18801883}
@@ -2072,6 +2075,88 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
20722075 return true ;
20732076}
20742077
2078+ bool AMDGPULegalizerInfo::legalizeFDIV64 (MachineInstr &MI,
2079+ MachineRegisterInfo &MRI,
2080+ MachineIRBuilder &B) const {
2081+ B.setInstr (MI);
2082+ Register Res = MI.getOperand (0 ).getReg ();
2083+ Register LHS = MI.getOperand (1 ).getReg ();
2084+ Register RHS = MI.getOperand (2 ).getReg ();
2085+
2086+ uint16_t Flags = MI.getFlags ();
2087+
2088+ LLT S64 = LLT::scalar (64 );
2089+ LLT S1 = LLT::scalar (1 );
2090+
2091+ auto One = B.buildFConstant (S64, 1.0 );
2092+
2093+ auto DivScale0 = B.buildIntrinsic (Intrinsic::amdgcn_div_scale, {S64, S1}, false )
2094+ .addUse (RHS)
2095+ .addUse (RHS)
2096+ .addUse (LHS)
2097+ .setMIFlags (Flags);
2098+
2099+ auto NegDivScale0 = B.buildFNeg (S64, DivScale0.getReg (0 ), Flags);
2100+
2101+ auto Rcp = B.buildIntrinsic (Intrinsic::amdgcn_rcp, {S64}, false )
2102+ .addUse (DivScale0.getReg (0 ))
2103+ .setMIFlags (Flags);
2104+
2105+ auto Fma0 = B.buildFMA (S64, NegDivScale0, Rcp, One, Flags);
2106+ auto Fma1 = B.buildFMA (S64, Rcp, Fma0, Rcp, Flags);
2107+ auto Fma2 = B.buildFMA (S64, NegDivScale0, Fma1, One, Flags);
2108+
2109+ auto DivScale1 = B.buildIntrinsic (Intrinsic::amdgcn_div_scale, {S64, S1}, false )
2110+ .addUse (LHS)
2111+ .addUse (RHS)
2112+ .addUse (LHS)
2113+ .setMIFlags (Flags);
2114+
2115+ auto Fma3 = B.buildFMA (S64, Fma1, Fma2, Fma1, Flags);
2116+ auto Mul = B.buildMul (S64, DivScale1.getReg (0 ), Fma3, Flags);
2117+ auto Fma4 = B.buildFMA (S64, NegDivScale0, Mul, DivScale1.getReg (0 ), Flags);
2118+
2119+ Register Scale;
2120+ if (!ST.hasUsableDivScaleConditionOutput ()) {
2121+ // Workaround a hardware bug on SI where the condition output from div_scale
2122+ // is not usable.
2123+
2124+ Scale = MRI.createGenericVirtualRegister (S1);
2125+
2126+ LLT S32 = LLT::scalar (32 );
2127+
2128+ auto NumUnmerge = B.buildUnmerge (S32, LHS);
2129+ auto DenUnmerge = B.buildUnmerge (S32, RHS);
2130+ auto Scale0Unmerge = B.buildUnmerge (S32, DivScale0);
2131+ auto Scale1Unmerge = B.buildUnmerge (S32, DivScale1);
2132+
2133+ auto CmpNum = B.buildICmp (ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg (1 ),
2134+ Scale1Unmerge.getReg (1 ));
2135+ auto CmpDen = B.buildICmp (ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg (1 ),
2136+ Scale0Unmerge.getReg (1 ));
2137+ B.buildXor (Scale, CmpNum, CmpDen);
2138+ } else {
2139+ Scale = DivScale1.getReg (1 );
2140+ }
2141+
2142+ auto Fmas = B.buildIntrinsic (Intrinsic::amdgcn_div_fmas, {S64}, false )
2143+ .addUse (Fma4.getReg (0 ))
2144+ .addUse (Fma3.getReg (0 ))
2145+ .addUse (Mul.getReg (0 ))
2146+ .addUse (Scale)
2147+ .setMIFlags (Flags);
2148+
2149+ B.buildIntrinsic (Intrinsic::amdgcn_div_fixup, {S64}, false )
2150+ .addDef (Res)
2151+ .addUse (Fmas.getReg (0 ))
2152+ .addUse (RHS)
2153+ .addUse (LHS)
2154+ .setMIFlags (Flags);
2155+
2156+ MI.eraseFromParent ();
2157+ return true ;
2158+ }
2159+
20752160bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin (MachineInstr &MI,
20762161 MachineRegisterInfo &MRI,
20772162 MachineIRBuilder &B) const {
0 commit comments