diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7deeafc9ccfc..b18db5644979a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -386,6 +386,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
     }
+    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
   }
 
   // Set operations for 'LASX' feature.
@@ -448,6 +449,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
                        VT, Expand);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
     }
+    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
   }
 
   // Set DAG combine for LA32 and LA64.
@@ -466,8 +468,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
   // Set DAG combine for 'LASX' feature.
 
-  if (Subtarget.hasExtLASX())
+  if (Subtarget.hasExtLASX()) {
     setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::CONCAT_VECTORS);
+  }
 
   // Compute derived properties from the register classes.
   computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -592,7 +596,101 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
     return lowerVECREDUCE(Op, DAG);
   case ISD::ConstantFP:
     return lowerConstantFP(Op, DAG);
+  case ISD::FP_ROUND:
+    return lowerFP_ROUND(Op, DAG);
+  }
+  return SDValue();
+}
+
+// Combine two ISD::FP_ROUND / LoongArchISD::VFCVT nodes with the same type
+// into a single LoongArchISD::VFCVT. For example:
+//   x1 = fp_round x, 0
+//   y1 = fp_round y, 0
+//   z = concat_vectors x1, y1
+// Or
+//   x1 = LoongArchISD::VFCVT undef, x
+//   y1 = LoongArchISD::VFCVT undef, y
+//   z = LoongArchISD::VPACKEV y1, x1
+// can be combined to:
+//   z = LoongArchISD::VFCVT y, x
+static SDValue combineFP_ROUND(SDValue N, const SDLoc &DL, SelectionDAG &DAG,
+                               const LoongArchSubtarget &Subtarget) {
+  assert(((N->getOpcode() == ISD::CONCAT_VECTORS && N->getNumOperands() == 2) ||
+          (N->getOpcode() == LoongArchISD::VPACKEV)) &&
+         "Invalid Node");
+
+  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
+  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+  unsigned Opcode0 = Op0.getOpcode();
+  unsigned Opcode1 = Op1.getOpcode();
+  if (Opcode0 != Opcode1)
+    return SDValue();
+
+  if (Opcode0 != ISD::FP_ROUND && Opcode0 != LoongArchISD::VFCVT)
+    return SDValue();
+
+  // Check that both nodes have only one use.
+  if (!Op0.hasOneUse() || !Op1.hasOneUse())
+    return SDValue();
+
+  EVT VT = N.getValueType();
+  EVT SVT0 = Op0.getValueType();
+  EVT SVT1 = Op1.getValueType();
+  // Check that both nodes have the same result type.
+  if (SVT0 != SVT1)
+    return SDValue();
+
+  // Check that both nodes have the same operand type.
+  EVT SSVT0 = Op0.getOperand(0).getValueType();
+  EVT SSVT1 = Op1.getOperand(0).getValueType();
+  if (SSVT0 != SSVT1)
+    return SDValue();
+
+  if (N->getOpcode() == ISD::CONCAT_VECTORS && Opcode0 == ISD::FP_ROUND) {
+    if (Subtarget.hasExtLASX() && VT.is256BitVector() && SVT0 == MVT::v4f32 &&
+        SSVT0 == MVT::v4f64) {
+      // A vector_shuffle is required in the final step, as the xvfcvt
+      // instruction operates on each 128-bit segment as a separate lane.
+      SDValue Res = DAG.getNode(LoongArchISD::VFCVT, DL, MVT::v8f32,
+                                Op1.getOperand(0), Op0.getOperand(0));
+      SDValue Undef = DAG.getUNDEF(VT);
+      SmallVector<int> Mask = {0, 1, 4, 5, 2, 3, 6, 7};
+      Res = DAG.getVectorShuffle(VT, DL, Res, Undef, Mask);
+      return DAG.getBitcast(VT, Res);
+    }
   }
+
+  if (N->getOpcode() == LoongArchISD::VPACKEV &&
+      Opcode0 == LoongArchISD::VFCVT) {
+    // For VPACKEV, check that the first operand of each LoongArchISD::VFCVT
+    // is undef.
+    if (!Op0.getOperand(0).isUndef() || !Op1.getOperand(0).isUndef())
+      return SDValue();
+
+    if (Subtarget.hasExtLSX() && (VT == MVT::v2i64 || VT == MVT::v2f64) &&
+        SVT0 == MVT::v4f32 && SSVT0 == MVT::v2f64) {
+      SDValue Res = DAG.getNode(LoongArchISD::VFCVT, DL, MVT::v4f32,
+                                Op0.getOperand(1), Op1.getOperand(1));
+      return DAG.getBitcast(VT, Res);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue LoongArchTargetLowering::lowerFP_ROUND(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue In = Op.getOperand(0);
+  MVT VT = Op.getSimpleValueType();
+  MVT SVT = In.getSimpleValueType();
+
+  if (VT == MVT::v4f32 && SVT == MVT::v4f64) {
+    // Split the v4f64 input into two v2f64 halves and convert both with a
+    // single vfcvt.s.d (low result elements from Lo, high from Hi).
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+    return DAG.getNode(LoongArchISD::VFCVT, DL, VT, Hi, Lo);
+  }
+  return SDValue();
 }
@@ -4720,6 +4818,21 @@ void LoongArchTargetLowering::ReplaceNodeResults(
     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Tmp1));
     break;
   }
+  case ISD::FP_ROUND: {
+    assert(VT == MVT::v2f32 && Subtarget.hasExtLSX() &&
+           "Unexpected custom legalisation");
+    // Without custom widening, an FP_ROUND from v2f64 to v2f32 is scalarized
+    // on LSX targets. Widen the v2f32 result to v4f32 instead and emit a
+    // target-specific LoongArchISD::VFCVT.
+    if (VT == MVT::v2f32) {
+      SDValue Src = N->getOperand(0);
+      SDValue Undef = DAG.getUNDEF(Src.getValueType());
+      SDValue Dst =
+          DAG.getNode(LoongArchISD::VFCVT, DL, MVT::v4f32, Undef, Src);
+      Results.push_back(Dst);
+    }
+    break;
+  }
   case ISD::BSWAP: {
     SDValue Src = N->getOperand(0);
     assert((VT == MVT::i16 || VT == MVT::i32) &&
@@ -6679,6 +6792,20 @@ performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue
+performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const LoongArchSubtarget &Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  if (VT.isVector() && N->getNumOperands() == 2)
+    if (SDValue R = combineFP_ROUND(SDValue(N, 0), DL, DAG, Subtarget))
+      return R;
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -6714,6 +6841,12 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget);
   case ISD::EXTRACT_VECTOR_ELT:
     return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget);
+  case ISD::CONCAT_VECTORS:
+    return performCONCAT_VECTORSCombine(N, DAG, DCI, Subtarget);
+  case LoongArchISD::VPACKEV:
+    if (SDValue Result =
+            combineFP_ROUND(SDValue(N, 0), SDLoc(N), DAG, Subtarget))
+      return Result;
   }
   return SDValue();
 }
@@ -7512,6 +7645,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VANY_NONZERO)
     NODE_NAME_CASE(FRECIPE)
     NODE_NAME_CASE(FRSQRTE)
+    NODE_NAME_CASE(VFCVT)
     NODE_NAME_CASE(VSLLI)
     NODE_NAME_CASE(VSRLI)
     NODE_NAME_CASE(VBSLL)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 8a4d7748467c7..b69e6240cab61 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -167,6 +167,8 @@ enum NodeType : unsigned {
   FRECIPE,
   FRSQRTE,
 
+  VFCVT,
+
   // Vector logicial left / right shift by immediate
   VSLLI,
   VSRLI,
@@ -415,6 +417,7 @@ class LoongArchTargetLowering : public TargetLowering {
   SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
 
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5143d53bad719..0a170de40f834 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -2403,6 +2403,10 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
 def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
           (XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>;
 
+// Vector floating-point conversion
+def : Pat<(v8f32 (loongarch_vfcvt_s_d (v4f64 LASX256:$xj), (v4f64 LASX256:$xk))),
+          (XVFCVT_S_D LASX256:$xj, LASX256:$xk)>;
+
 // load
 def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm),
           (XVLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 8d1dc99e316c9..844d391b49c3f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -32,6 +32,8 @@ def SDT_LoongArchVFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTC
 def SDT_LoongArchVFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
 def SDT_LoongArchVLDREPL : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisPtrTy<1>]>;
 def SDT_LoongArchVMSKCOND : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDT_LoongArchVFCVT_S_D : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+                                                  SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<1, 2>]>;
 
 // Target nodes.
 def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>;
@@ -82,6 +84,8 @@ def loongarch_vmskgez: SDNode<"LoongArchISD::VMSKGEZ", SDT_LoongArchVMSKCOND>;
 def loongarch_vmskeqz: SDNode<"LoongArchISD::VMSKEQZ", SDT_LoongArchVMSKCOND>;
 def loongarch_vmsknez: SDNode<"LoongArchISD::VMSKNEZ", SDT_LoongArchVMSKCOND>;
 
+def loongarch_vfcvt_s_d: SDNode<"LoongArchISD::VFCVT", SDT_LoongArchVFCVT_S_D>;
+
 def immZExt1 : ImmLeaf<GRLenVT, [{return isUInt<1>(Imm);}]>;
 def immZExt2 : ImmLeaf<GRLenVT, [{return isUInt<2>(Imm);}]>;
 def immZExt3 : ImmLeaf<GRLenVT, [{return isUInt<3>(Imm);}]>;
@@ -2519,6 +2523,9 @@ def : Pat<(f64 (froundeven FPR64:$fj)),
           (f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D (SUBREG_TO_REG
                                     (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>;
 
+def : Pat<(v4f32 (loongarch_vfcvt_s_d (v2f64 LSX128:$vj), (v2f64 LSX128:$vk))),
+          (VFCVT_S_D LSX128:$vj, LSX128:$vk)>;
+
 // load
 def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
           (VLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptrunc.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptrunc.ll
index 6ade53d9ef531..7c6345b7c0057 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptrunc.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptrunc.ll
@@ -7,18 +7,9 @@ define void @fptrunc_v4f64_to_v4f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: fptrunc_v4f64_to_v4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT:    fcvt.s.d $fa1, $fa1
-; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 0
-; CHECK-NEXT:    fcvt.s.d $fa2, $fa2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 2
-; CHECK-NEXT:    fcvt.s.d $fa1, $fa1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 3
-; CHECK-NEXT:    fcvt.s.d $fa0, $fa0
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT:    vst $vr2, $a0, 0
+; CHECK-NEXT:    xvpermi.q $xr1, $xr0, 1
+; CHECK-NEXT:    vfcvt.s.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <4 x double>, ptr %a0
@@ -30,32 +21,13 @@ define void @fptrunc_v8f64_to_v8f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: fptrunc_v8f64_to_v8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvld $xr0, $a1, 32
-; CHECK-NEXT:    xvld $xr1, $a1, 0
-; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 1
-; CHECK-NEXT:    fcvt.s.d $fa2, $fa2
-; CHECK-NEXT:    xvpickve.d $xr3, $xr0, 0
-; CHECK-NEXT:    fcvt.s.d $fa3, $fa3
-; CHECK-NEXT:    vextrins.w $vr3, $vr2, 16
-; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT:    fcvt.s.d $fa2, $fa2
-; CHECK-NEXT:    vextrins.w $vr3, $vr2, 32
-; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 3
-; CHECK-NEXT:    fcvt.s.d $fa0, $fa0
-; CHECK-NEXT:    vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT:    xvpickve.d $xr0, $xr1, 1
-; CHECK-NEXT:    fcvt.s.d $fa0, $fa0
-; CHECK-NEXT:    xvpickve.d $xr2, $xr1, 0
-; CHECK-NEXT:    fcvt.s.d $fa2, $fa2
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 16
-; CHECK-NEXT:    xvpickve.d $xr0, $xr1, 2
-; CHECK-NEXT:    fcvt.s.d $fa0, $fa0
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 32
-; CHECK-NEXT:    xvpickve.d $xr0, $xr1, 3
-; CHECK-NEXT:    fcvt.s.d $fa0, $fa0
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT:    xvpermi.q $xr2, $xr3, 2
-; CHECK-NEXT:    xvst $xr2, $a0, 0
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a1, 32
+; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT:    xvfcvt.s.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvperm.w $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <8 x double>, ptr %a0
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptrunc.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptrunc.ll
index acd487a889c4e..e4f2f0906743d 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptrunc.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptrunc.ll
@@ -21,22 +21,15 @@ define void @fptrunc_v2f64_to_v2f32(ptr %res, ptr %a0) nounwind {
 ; LA32-LABEL: fptrunc_v2f64_to_v2f32:
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    vld $vr0, $a1, 0
-; LA32-NEXT:    vreplvei.d $vr1, $vr0, 0
-; LA32-NEXT:    fcvt.s.d $fa1, $fa1
-; LA32-NEXT:    vreplvei.d $vr0, $vr0, 1
-; LA32-NEXT:    fcvt.s.d $fa0, $fa0
-; LA32-NEXT:    fst.s $fa0, $a0, 4
-; LA32-NEXT:    fst.s $fa1, $a0, 0
+; LA32-NEXT:    vfcvt.s.d $vr0, $vr0, $vr0
+; LA32-NEXT:    vstelm.w $vr0, $a0, 4, 1
+; LA32-NEXT:    vstelm.w $vr0, $a0, 0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: fptrunc_v2f64_to_v2f32:
 ; LA64:       # %bb.0: # %entry
 ; LA64-NEXT:    vld $vr0, $a1, 0
-; LA64-NEXT:    vreplvei.d $vr1, $vr0, 1
-; LA64-NEXT:    fcvt.s.d $fa1, $fa1
-; LA64-NEXT:    vreplvei.d $vr0, $vr0, 0
-; LA64-NEXT:    fcvt.s.d $fa0, $fa0
-; LA64-NEXT:    vextrins.w $vr0, $vr1, 16
+; LA64-NEXT:    vfcvt.s.d $vr0, $vr0, $vr0
 ; LA64-NEXT:    vstelm.d $vr0, $a0, 0, 0
 ; LA64-NEXT:    ret
 entry:
@@ -51,17 +44,7 @@ define void @fptrunc_v4f64_to_v4f32(ptr %res, ptr %a0) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a1, 16
-; CHECK-NEXT:    vreplvei.d $vr2, $vr0, 1
-; CHECK-NEXT:    fcvt.s.d $fa2, $fa2
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    fcvt.s.d $fa0, $fa0
-; CHECK-NEXT:    vextrins.w $vr0, $vr2, 16
-; CHECK-NEXT:    vreplvei.d $vr2, $vr1, 0
-; CHECK-NEXT:    fcvt.s.d $fa2, $fa2
-; CHECK-NEXT:    vextrins.w $vr0, $vr2, 32
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 1
-; CHECK-NEXT:    fcvt.s.d $fa1, $fa1
-; CHECK-NEXT:    vextrins.w $vr0, $vr1, 48
+; CHECK-NEXT:    vfcvt.s.d $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
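
As a usage sketch (hypothetical, not a test from this patch): the CONCAT_VECTORS path of combineFP_ROUND targets IR of the following shape, where two v2f64-to-v2f32 fptruncs feed a concatenating shufflevector. Previously the LSX lowering emitted per-element fcvt.s.d plus vextrins.w sequences; with this change the function should select a single vfcvt.s.d. The function name and RUN line are illustrative assumptions modeled on the existing tests above.

; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
define void @fptrunc_concat_v2f64(ptr %res, ptr %a, ptr %b) nounwind {
entry:
  %va = load <2 x double>, ptr %a
  %vb = load <2 x double>, ptr %b
  %ta = fptrunc <2 x double> %va to <2 x float>
  %tb = fptrunc <2 x double> %vb to <2 x float>
  ; Concatenate the two v2f32 halves; this is the concat_vectors
  ; (or, after v2f32 widening, the VPACKEV) that the combine matches.
  %cat = shufflevector <2 x float> %ta, <2 x float> %tb, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %cat, ptr %res
  ret void
}

Whether this reaches the combine as ISD::CONCAT_VECTORS or as a LoongArchISD::VPACKEV of two widened VFCVT nodes depends on how type legalization handles the illegal v2f32 values, which is why combineFP_ROUND matches both forms.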