From fde129a5b919c922dbc00d055e02c6dadaa24e75 Mon Sep 17 00:00:00 2001 From: sun-jacobi Date: Sun, 14 Jan 2024 17:47:16 +0900 Subject: [PATCH] [RISCV][Isel] Combine scalable vector add/sub/mul with zero/sign extension. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 238 ++++++-- llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll | 128 ++-- .../RISCV/rvv/vscale-vw-web-simplification.ll | 549 ++++++++++++++++-- 3 files changed, 760 insertions(+), 155 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index cb9ffabc41236..129b98ff485b8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1374,8 +1374,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setPrefLoopAlignment(Subtarget.getPrefLoopAlignment()); setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, - ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND, - ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT}); + ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL, + ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT}); if (Subtarget.is64Bit()) setTargetDAGCombine(ISD::SRA); @@ -12863,9 +12863,9 @@ struct CombineResult; /// Helper class for folding sign/zero extensions. /// In particular, this class is used for the following combines: -/// add_vl -> vwadd(u) | vwadd(u)_w -/// sub_vl -> vwsub(u) | vwsub(u)_w -/// mul_vl -> vwmul(u) | vwmul_su +/// add | add_vl -> vwadd(u) | vwadd(u)_w +/// sub | sub_vl -> vwsub(u) | vwsub(u)_w +/// mul | mul_vl -> vwmul(u) | vwmul_su /// /// An object of this class represents an operand of the operation we want to /// combine. @@ -12910,6 +12910,8 @@ struct NodeExtensionHelper { /// E.g., for zext(a), this would return a. SDValue getSource() const { switch (OrigOperand.getOpcode()) { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: return OrigOperand.getOperand(0); @@ -12926,7 +12928,8 @@ struct NodeExtensionHelper { /// Get or create a value that can feed \p Root with the given extension \p /// SExt. If \p SExt is std::nullopt, this returns the source of this operand. /// \see ::getSource(). - SDValue getOrCreateExtendedOp(const SDNode *Root, SelectionDAG &DAG, + SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget, std::optional SExt) const { if (!SExt.has_value()) return OrigOperand; @@ -12941,8 +12944,10 @@ struct NodeExtensionHelper { // If we need an extension, we should be changing the type. SDLoc DL(Root); - auto [Mask, VL] = getMaskAndVL(Root); + auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); switch (OrigOperand.getOpcode()) { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL); @@ -12982,12 +12987,15 @@ struct NodeExtensionHelper { /// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()). static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) { switch (Opcode) { + case ISD::ADD: case RISCVISD::ADD_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL; + case ISD::MUL: case RISCVISD::MUL_VL: return IsSExt ? 
RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL; + case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: @@ -13000,7 +13008,8 @@ struct NodeExtensionHelper { /// Get the opcode to materialize \p Opcode(sext(a), zext(b)) -> /// newOpcode(a, b). static unsigned getSUOpcode(unsigned Opcode) { - assert(Opcode == RISCVISD::MUL_VL && "SU is only supported for MUL"); + assert((Opcode == RISCVISD::MUL_VL || Opcode == ISD::MUL) && + "SU is only supported for MUL"); return RISCVISD::VWMULSU_VL; } @@ -13008,8 +13017,10 @@ struct NodeExtensionHelper { /// newOpcode(a, b). static unsigned getWOpcode(unsigned Opcode, bool IsSExt) { switch (Opcode) { + case ISD::ADD: case RISCVISD::ADD_VL: return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL; + case ISD::SUB: case RISCVISD::SUB_VL: return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL; default: @@ -13019,19 +13030,49 @@ struct NodeExtensionHelper { using CombineToTry = std::function( SDNode * /*Root*/, const NodeExtensionHelper & /*LHS*/, - const NodeExtensionHelper & /*RHS*/)>; + const NodeExtensionHelper & /*RHS*/, SelectionDAG &, + const RISCVSubtarget &)>; /// Check if this node needs to be fully folded or extended for all users. bool needToPromoteOtherUsers() const { return EnforceOneUse; } /// Helper method to set the various fields of this struct based on the /// type of \p Root. - void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG) { + void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { SupportsZExt = false; SupportsSExt = false; EnforceOneUse = true; CheckMask = true; - switch (OrigOperand.getOpcode()) { + unsigned Opc = OrigOperand.getOpcode(); + switch (Opc) { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: { + MVT VT = OrigOperand.getSimpleValueType(); + if (!VT.isVector()) + break; + + SDValue NarrowElt = OrigOperand.getOperand(0); + MVT NarrowVT = NarrowElt.getSimpleValueType(); + + unsigned ScalarBits = VT.getScalarSizeInBits(); + unsigned NarrowScalarBits = NarrowVT.getScalarSizeInBits(); + + // Ensure the narrowing element type is legal + if (!Subtarget.getTargetLowering()->isTypeLegal(NarrowElt.getValueType())) + break; + + // Ensure the extension's semantic is equivalent to rvv vzext or vsext. + if (ScalarBits != NarrowScalarBits * 2) + break; + + SupportsZExt = Opc == ISD::ZERO_EXTEND; + SupportsSExt = Opc == ISD::SIGN_EXTEND; + + SDLoc DL(Root); + std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget); + break; + } case RISCVISD::VZEXT_VL: SupportsZExt = true; Mask = OrigOperand.getOperand(1); @@ -13087,8 +13128,16 @@ struct NodeExtensionHelper { } /// Check if \p Root supports any extension folding combines. - static bool isSupportedRoot(const SDNode *Root) { + static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) { switch (Root->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(Root->getValueType(0))) + return false; + return Root->getValueType(0).isScalableVector(); + } case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: @@ -13103,9 +13152,10 @@ struct NodeExtensionHelper { } /// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx). 
- NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG) { - assert(isSupportedRoot(Root) && "Trying to build an helper with an " - "unsupported root"); + NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an " + "unsupported root"); assert(OperandIdx < 2 && "Requesting something else than LHS or RHS"); OrigOperand = Root->getOperand(OperandIdx); @@ -13121,7 +13171,7 @@ struct NodeExtensionHelper { SupportsZExt = Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL; SupportsSExt = !SupportsZExt; - std::tie(Mask, VL) = getMaskAndVL(Root); + std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget); CheckMask = true; // There's no existing extension here, so we don't have to worry about // making sure it gets removed. @@ -13130,7 +13180,7 @@ struct NodeExtensionHelper { } [[fallthrough]]; default: - fillUpExtensionSupport(Root, DAG); + fillUpExtensionSupport(Root, DAG, Subtarget); break; } } @@ -13146,14 +13196,27 @@ struct NodeExtensionHelper { } /// Helper function to get the Mask and VL from \p Root. - static std::pair getMaskAndVL(const SDNode *Root) { - assert(isSupportedRoot(Root) && "Unexpected root"); - return std::make_pair(Root->getOperand(3), Root->getOperand(4)); + static std::pair + getMaskAndVL(const SDNode *Root, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + assert(isSupportedRoot(Root, DAG) && "Unexpected root"); + switch (Root->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: { + SDLoc DL(Root); + MVT VT = Root->getSimpleValueType(0); + return getDefaultScalableVLOps(VT, DL, DAG, Subtarget); + } + default: + return std::make_pair(Root->getOperand(3), Root->getOperand(4)); + } } /// Check if the Mask and VL of this operand are compatible with \p Root. - bool areVLAndMaskCompatible(const SDNode *Root) const { - auto [Mask, VL] = getMaskAndVL(Root); + bool areVLAndMaskCompatible(SDNode *Root, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) const { + auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); return isMaskCompatible(Mask) && isVLCompatible(VL); } @@ -13161,11 +13224,14 @@ struct NodeExtensionHelper { /// foldings that are supported by this class. static bool isCommutative(const SDNode *N) { switch (N->getOpcode()) { + case ISD::ADD: + case ISD::MUL: case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: return true; + case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: @@ -13210,14 +13276,25 @@ struct CombineResult { /// Return a value that uses TargetOpcode and that can be used to replace /// Root. /// The actual replacement is *not* done in that method. 
- SDValue materialize(SelectionDAG &DAG) const { + SDValue materialize(SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) const { SDValue Mask, VL, Merge; - std::tie(Mask, VL) = NodeExtensionHelper::getMaskAndVL(Root); - Merge = Root->getOperand(2); + std::tie(Mask, VL) = + NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget); + switch (Root->getOpcode()) { + default: + Merge = Root->getOperand(2); + break; + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + Merge = DAG.getUNDEF(Root->getValueType(0)); + break; + } return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0), - LHS.getOrCreateExtendedOp(Root, DAG, SExtLHS), - RHS.getOrCreateExtendedOp(Root, DAG, SExtRHS), Merge, - Mask, VL); + LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS), + RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS), + Merge, Mask, VL); } }; @@ -13234,15 +13311,16 @@ struct CombineResult { static std::optional canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS, const NodeExtensionHelper &RHS, bool AllowSExt, - bool AllowZExt) { + bool AllowZExt, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { assert((AllowSExt || AllowZExt) && "Forgot to set what you want?"); - if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root)) + if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || + !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) return std::nullopt; if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt) return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( Root->getOpcode(), /*IsSExt=*/false), - Root, LHS, /*SExtLHS=*/false, RHS, - /*SExtRHS=*/false); + Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false); if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt) return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( Root->getOpcode(), /*IsSExt=*/true), @@ -13259,9 +13337,10 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS) { + const NodeExtensionHelper &RHS, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/true); + /*AllowZExt=*/true, DAG, Subtarget); } /// Check if \p Root follows a pattern Root(LHS, ext(RHS)) @@ -13270,8 +13349,9 @@ canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS) { - if (!RHS.areVLAndMaskCompatible(Root)) + const NodeExtensionHelper &RHS, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) return std::nullopt; // FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar @@ -13295,9 +13375,10 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. 
static std::optional canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS) { + const NodeExtensionHelper &RHS, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/false); + /*AllowZExt=*/false, DAG, Subtarget); } /// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS)) @@ -13306,9 +13387,10 @@ canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS) { + const NodeExtensionHelper &RHS, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false, - /*AllowZExt=*/true); + /*AllowZExt=*/true, DAG, Subtarget); } /// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS)) @@ -13317,10 +13399,13 @@ canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS) { + const NodeExtensionHelper &RHS, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (!LHS.SupportsSExt || !RHS.SupportsZExt) return std::nullopt; - if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root)) + if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || + !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) return std::nullopt; return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()), Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false); @@ -13330,6 +13415,8 @@ SmallVector NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { SmallVector Strategies; switch (Root->getOpcode()) { + case ISD::ADD: + case ISD::SUB: case RISCVISD::ADD_VL: case RISCVISD::SUB_VL: // add|sub -> vwadd(u)|vwsub(u) @@ -13337,6 +13424,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { // add|sub -> vwadd(u)_w|vwsub(u)_w Strategies.push_back(canFoldToVW_W); break; + case ISD::MUL: case RISCVISD::MUL_VL: // mul -> vwmul(u) Strategies.push_back(canFoldToVWWithSameExtension); @@ -13367,12 +13455,14 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { /// mul_vl -> vwmul(u) | vwmul_su /// vwadd_w(u) -> vwadd(u) /// vwub_w(u) -> vwadd(u) -static SDValue -combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { +static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { SelectionDAG &DAG = DCI.DAG; - assert(NodeExtensionHelper::isSupportedRoot(N) && - "Shouldn't have called this method"); + if (!NodeExtensionHelper::isSupportedRoot(N, DAG)) + return SDValue(); + SmallVector Worklist; SmallSet Inserted; Worklist.push_back(N); @@ -13381,11 +13471,11 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { while (!Worklist.empty()) { SDNode *Root = Worklist.pop_back_val(); - if (!NodeExtensionHelper::isSupportedRoot(Root)) + if (!NodeExtensionHelper::isSupportedRoot(Root, DAG)) return SDValue(); - NodeExtensionHelper LHS(N, 0, DAG); - NodeExtensionHelper RHS(N, 1, DAG); + NodeExtensionHelper LHS(N, 0, DAG, Subtarget); + NodeExtensionHelper RHS(N, 1, DAG, Subtarget); auto AppendUsersIfNeeded = [&Worklist, &Inserted](const NodeExtensionHelper &Op) { if (Op.needToPromoteOtherUsers()) { @@ -13412,7 +13502,8 @@ 
combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { for (NodeExtensionHelper::CombineToTry FoldingStrategy : FoldingStrategies) { - std::optional Res = FoldingStrategy(N, LHS, RHS); + std::optional Res = + FoldingStrategy(N, LHS, RHS, DAG, Subtarget); if (Res) { Matched = true; CombinesToApply.push_back(*Res); @@ -13441,7 +13532,7 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SmallVector> ValuesToReplace; ValuesToReplace.reserve(CombinesToApply.size()); for (CombineResult Res : CombinesToApply) { - SDValue NewValue = Res.materialize(DAG); + SDValue NewValue = Res.materialize(DAG, Subtarget); if (!InputRootReplacement) { assert(Res.Root == N && "First element is expected to be the current node"); @@ -14713,13 +14804,20 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG, static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - assert(N->getOpcode() == RISCVISD::ADD_VL); + + assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD); + + if (N->getValueType(0).isFixedLengthVector()) + return SDValue(); + SDValue Addend = N->getOperand(0); SDValue MulOp = N->getOperand(1); - SDValue AddMergeOp = N->getOperand(2); - if (!AddMergeOp.isUndef()) - return SDValue(); + if (N->getOpcode() == RISCVISD::ADD_VL) { + SDValue AddMergeOp = N->getOperand(2); + if (!AddMergeOp.isUndef()) + return SDValue(); + } auto IsVWMulOpc = [](unsigned Opc) { switch (Opc) { @@ -14743,8 +14841,16 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, if (!MulMergeOp.isUndef()) return SDValue(); - SDValue AddMask = N->getOperand(3); - SDValue AddVL = N->getOperand(4); + auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (N->getOpcode() == ISD::ADD) { + SDLoc DL(N); + return getDefaultScalableVLOps(N->getSimpleValueType(0), DL, DAG, + Subtarget); + } + return std::make_pair(N->getOperand(3), N->getOperand(4)); + }(N, DAG, Subtarget); + SDValue MulMask = MulOp.getOperand(3); SDValue MulVL = MulOp.getOperand(4); @@ -15010,10 +15116,18 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewFMV, DAG.getConstant(~SignBit, DL, VT)); } - case ISD::ADD: + case ISD::ADD: { + if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) + return V; + if (SDValue V = combineToVWMACC(N, DAG, Subtarget)) + return V; return performADDCombine(N, DAG, Subtarget); - case ISD::SUB: + } + case ISD::SUB: { + if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) + return V; return performSUBCombine(N, DAG, Subtarget); + } case ISD::AND: return performANDCombine(N, DCI, Subtarget); case ISD::OR: @@ -15021,6 +15135,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::XOR: return performXORCombine(N, DAG, Subtarget); case ISD::MUL: + if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) + return V; return performMULCombine(N, DAG); case ISD::FADD: case ISD::UMAX: @@ -15497,7 +15613,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case RISCVISD::ADD_VL: - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI)) + if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) return V; return combineToVWMACC(N, DAG, Subtarget); case RISCVISD::SUB_VL: @@ -15506,7 +15622,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: case RISCVISD::MUL_VL: - return combineBinOp_VLToVWBinOp_VL(N, 
DCI); + return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget); case RISCVISD::VFMADD_VL: case RISCVISD::VFNMADD_VL: case RISCVISD::VFMSUB_VL: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index 47d65c2593a4c..fc94f8c2a5279 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -1231,16 +1231,17 @@ define @ctlz_nxv1i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv1i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vmv.v.x v9, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 -; CHECK-F-NEXT: vsrl.vi v8, v9, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-F-NEXT: vzext.vf2 v9, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, v9, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vwsubu.wv v9, v9, v8 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v9, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1371,16 +1372,17 @@ define @ctlz_nxv2i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv2i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vmv.v.x v10, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 -; CHECK-F-NEXT: vsrl.vi v8, v10, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vzext.vf2 v10, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, v10, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 +; CHECK-F-NEXT: vwsubu.wv v10, v10, v8 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v10, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1511,16 +1513,17 @@ define @ctlz_nxv4i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv4i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vmv.v.x v12, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 -; CHECK-F-NEXT: vsrl.vi v8, v12, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vzext.vf2 v12, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, v12, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 +; CHECK-F-NEXT: vwsubu.wv v12, v12, v8 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v12, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -1651,16 +1654,17 @@ define @ctlz_nxv8i64( %va) { ; ; CHECK-F-LABEL: ctlz_nxv8i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vmv.v.x v16, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 -; CHECK-F-NEXT: vsrl.vi v8, v16, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; 
CHECK-F-NEXT: vzext.vf2 v16, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, v16, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v24, v8 +; CHECK-F-NEXT: vsrl.vi v8, v24, 23 +; CHECK-F-NEXT: vwsubu.wv v16, v16, v8 ; CHECK-F-NEXT: li a1, 64 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vminu.vx v8, v16, a1 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; @@ -2833,15 +2837,16 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vmv.v.x v9, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 -; CHECK-F-NEXT: vsrl.vi v8, v9, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-F-NEXT: vzext.vf2 v9, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, v9, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vwsubu.wv v9, v9, v8 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vmv1r.v v8, v9 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64: @@ -2968,15 +2973,16 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vmv.v.x v10, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 -; CHECK-F-NEXT: vsrl.vi v8, v10, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vzext.vf2 v10, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, v10, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 +; CHECK-F-NEXT: vwsubu.wv v10, v10, v8 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vmv2r.v v8, v10 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64: @@ -3103,15 +3109,16 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vmv.v.x v12, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 -; CHECK-F-NEXT: vsrl.vi v8, v12, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vzext.vf2 v12, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, v12, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 +; CHECK-F-NEXT: vwsubu.wv v12, v12, v8 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vmv4r.v v8, v12 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64: @@ -3238,14 +3245,15 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64: ; CHECK-F: # %bb.0: -; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmv8r.v v16, v8 +; CHECK-F-NEXT: li a0, 190 +; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vmv.v.x v8, a0 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 -; CHECK-F-NEXT: vsrl.vi v8, v16, 23 -; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-F-NEXT: vzext.vf2 v16, v8 -; CHECK-F-NEXT: li a1, 190 -; CHECK-F-NEXT: vrsub.vx v8, 
v16, a1 +; CHECK-F-NEXT: vfncvt.f.xu.w v24, v16 +; CHECK-F-NEXT: vsrl.vi v16, v24, 23 +; CHECK-F-NEXT: vwsubu.wv v8, v8, v16 ; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll index d99e3a7fe690a..972fa66917a56 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll @@ -1,25 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,RV32 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,RV32 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,RV64 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,RV64 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,RV64 ; Check that the default value enables the web folding and ; that it is bigger than 3. -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING - -; FIXME: We should use vwadd/vwsub/vwmul instructions. +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING,RV64 ; Check that the scalable vector add/sub/mul operations are all promoted into their ; vw counterpart when the folding of the web size is increased to 3. ; We need the web size to be at least 3 for the folding to happen, because ; %c has 3 uses. ; see https://github.com/llvm/llvm-project/pull/72340 -; FIXME: We don't currently use widening instructions. 
-define @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr %z) { -; NO_FOLDING-LABEL: vwop_vscale_sext_multiple_users: + +define @vwop_vscale_sext_i8i16_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_sext_i8i16_multiple_users: ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma ; NO_FOLDING-NEXT: vle8.v v8, (a0) @@ -35,20 +33,18 @@ define @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr % ; NO_FOLDING-NEXT: vor.vv v8, v8, v9 ; NO_FOLDING-NEXT: ret ; -; FOLDING-LABEL: vwop_vscale_sext_multiple_users: +; FOLDING-LABEL: vwop_vscale_sext_i8i16_multiple_users: ; FOLDING: # %bb.0: -; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; FOLDING-NEXT: vle8.v v8, (a0) ; FOLDING-NEXT: vle8.v v9, (a1) ; FOLDING-NEXT: vle8.v v10, (a2) -; FOLDING-NEXT: vsext.vf2 v11, v8 -; FOLDING-NEXT: vsext.vf2 v8, v9 -; FOLDING-NEXT: vsext.vf2 v9, v10 -; FOLDING-NEXT: vmul.vv v8, v11, v8 -; FOLDING-NEXT: vadd.vv v10, v11, v9 -; FOLDING-NEXT: vsub.vv v9, v11, v9 -; FOLDING-NEXT: vor.vv v8, v8, v10 -; FOLDING-NEXT: vor.vv v8, v8, v9 +; FOLDING-NEXT: vwmul.vv v11, v8, v9 +; FOLDING-NEXT: vwadd.vv v9, v8, v10 +; FOLDING-NEXT: vwsub.vv v12, v8, v10 +; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; FOLDING-NEXT: vor.vv v8, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v12 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y @@ -64,18 +60,220 @@ define @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr % ret %i } +define @vwop_vscale_sext_i16i32_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_sext_i16i32_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; NO_FOLDING-NEXT: vle16.v v8, (a0) +; NO_FOLDING-NEXT: vle16.v v9, (a1) +; NO_FOLDING-NEXT: vle16.v v10, (a2) +; NO_FOLDING-NEXT: vsext.vf2 v11, v8 +; NO_FOLDING-NEXT: vsext.vf2 v8, v9 +; NO_FOLDING-NEXT: vsext.vf2 v9, v10 +; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vwop_vscale_sext_i16i32_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; FOLDING-NEXT: vle16.v v8, (a0) +; FOLDING-NEXT: vle16.v v9, (a1) +; FOLDING-NEXT: vle16.v v10, (a2) +; FOLDING-NEXT: vwmul.vv v11, v8, v9 +; FOLDING-NEXT: vwadd.vv v9, v8, v10 +; FOLDING-NEXT: vwsub.vv v12, v8, v10 +; FOLDING-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; FOLDING-NEXT: vor.vv v8, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v12 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = sext %a to + %d = sext %b to + %d2 = sext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + +define @vwop_vscale_sext_i32i64_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_sext_i32i64_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vl1re32.v v8, (a0) +; NO_FOLDING-NEXT: vl1re32.v v9, (a1) +; NO_FOLDING-NEXT: vl1re32.v v10, (a2) +; NO_FOLDING-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; NO_FOLDING-NEXT: vsext.vf2 v12, v8 +; NO_FOLDING-NEXT: vsext.vf2 v14, v9 +; NO_FOLDING-NEXT: vsext.vf2 v8, v10 +; NO_FOLDING-NEXT: vmul.vv v10, v12, v14 +; NO_FOLDING-NEXT: vadd.vv v14, v12, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v12, v8 +; NO_FOLDING-NEXT: vor.vv v10, v10, v14 +; NO_FOLDING-NEXT: vor.vv v8, v10, v8 +; 
NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vwop_vscale_sext_i32i64_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vl1re32.v v8, (a0) +; FOLDING-NEXT: vl1re32.v v9, (a1) +; FOLDING-NEXT: vl1re32.v v10, (a2) +; FOLDING-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; FOLDING-NEXT: vwmul.vv v12, v8, v9 +; FOLDING-NEXT: vwadd.vv v14, v8, v10 +; FOLDING-NEXT: vwsub.vv v16, v8, v10 +; FOLDING-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; FOLDING-NEXT: vor.vv v8, v12, v14 +; FOLDING-NEXT: vor.vv v8, v8, v16 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = sext %a to + %d = sext %b to + %d2 = sext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} +define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) { +; RV32-LABEL: vwop_vscale_sext_i1i32_multiple_users: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu +; RV32-NEXT: vlm.v v8, (a0) +; RV32-NEXT: vlm.v v9, (a1) +; RV32-NEXT: vlm.v v10, (a2) +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vmv.v.v v0, v8 +; RV32-NEXT: vmerge.vim v12, v11, -1, v0 +; RV32-NEXT: vmv.v.v v0, v9 +; RV32-NEXT: vmerge.vim v9, v11, -1, v0 +; RV32-NEXT: vmv.v.v v0, v10 +; RV32-NEXT: vmerge.vim v10, v11, -1, v0 +; RV32-NEXT: vmul.vv v9, v12, v9 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vv v11, v12, v10 +; RV32-NEXT: vmv.v.v v0, v8 +; RV32-NEXT: vsub.vx v10, v10, a0, v0.t +; RV32-NEXT: vor.vv v8, v9, v10 +; RV32-NEXT: vor.vv v8, v8, v11 +; RV32-NEXT: ret +; +; RV64-LABEL: vwop_vscale_sext_i1i32_multiple_users: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vlm.v v8, (a0) +; RV64-NEXT: vlm.v v9, (a1) +; RV64-NEXT: vlm.v v10, (a2) +; RV64-NEXT: vmv.v.i v11, 0 +; RV64-NEXT: vmv.v.v v0, v8 +; RV64-NEXT: vmerge.vim v12, v11, -1, v0 +; RV64-NEXT: vmv.v.v v0, v9 +; RV64-NEXT: vmerge.vim v9, v11, -1, v0 +; RV64-NEXT: vmv.v.v v0, v10 +; RV64-NEXT: vmerge.vim v10, v11, -1, v0 +; RV64-NEXT: vmul.vv v9, v12, v9 +; RV64-NEXT: vmv.v.v v0, v8 +; RV64-NEXT: vmerge.vim v8, v11, 1, v0 +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: vsub.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = sext %a to + %d = sext %b to + %d2 = sext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} -define @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr %z) { -; NO_FOLDING-LABEL: vwop_vscale_zext_multiple_users: +define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users: ; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; NO_FOLDING-NEXT: vlm.v v8, (a0) +; NO_FOLDING-NEXT: vlm.v v9, (a1) +; NO_FOLDING-NEXT: vlm.v v10, (a2) +; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: vmv1r.v v0, v8 +; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; NO_FOLDING-NEXT: vmv1r.v v0, v9 +; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 +; NO_FOLDING-NEXT: vmv1r.v v0, v10 +; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 +; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 +; NO_FOLDING-NEXT: vmv1r.v v0, v8 +; NO_FOLDING-NEXT: vmerge.vim v8, v11, 1, v0 +; NO_FOLDING-NEXT: vsub.vv v8, v10, v8 +; NO_FOLDING-NEXT: vsub.vv v10, v12, v10 +; NO_FOLDING-NEXT: vor.vv v8, v9, v8 +; NO_FOLDING-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: 
vwop_vscale_sext_i1i8_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; FOLDING-NEXT: vlm.v v8, (a0) +; FOLDING-NEXT: vlm.v v9, (a1) +; FOLDING-NEXT: vlm.v v10, (a2) +; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: vmv1r.v v0, v8 +; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; FOLDING-NEXT: vmv1r.v v0, v9 +; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 +; FOLDING-NEXT: vmv1r.v v0, v10 +; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 +; FOLDING-NEXT: vmul.vv v9, v12, v9 +; FOLDING-NEXT: vmv1r.v v0, v8 +; FOLDING-NEXT: vmerge.vim v8, v11, 1, v0 +; FOLDING-NEXT: vsub.vv v8, v10, v8 +; FOLDING-NEXT: vsub.vv v10, v12, v10 +; FOLDING-NEXT: vor.vv v8, v9, v8 +; FOLDING-NEXT: vor.vv v8, v8, v10 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = sext %a to + %d = sext %b to + %d2 = sext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + +define @vwop_vscale_sext_i8i32_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_sext_i8i32_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma ; NO_FOLDING-NEXT: vle8.v v8, (a0) ; NO_FOLDING-NEXT: vle8.v v9, (a1) ; NO_FOLDING-NEXT: vle8.v v10, (a2) -; NO_FOLDING-NEXT: vzext.vf2 v11, v8 -; NO_FOLDING-NEXT: vzext.vf2 v8, v9 -; NO_FOLDING-NEXT: vzext.vf2 v9, v10 +; NO_FOLDING-NEXT: vsext.vf4 v11, v8 +; NO_FOLDING-NEXT: vsext.vf4 v8, v9 +; NO_FOLDING-NEXT: vsext.vf4 v9, v10 ; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 ; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 ; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 @@ -83,20 +281,64 @@ define @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr % ; NO_FOLDING-NEXT: vor.vv v8, v8, v9 ; NO_FOLDING-NEXT: ret ; -; FOLDING-LABEL: vwop_vscale_zext_multiple_users: +; FOLDING-LABEL: vwop_vscale_sext_i8i32_multiple_users: ; FOLDING: # %bb.0: -; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma ; FOLDING-NEXT: vle8.v v8, (a0) ; FOLDING-NEXT: vle8.v v9, (a1) ; FOLDING-NEXT: vle8.v v10, (a2) -; FOLDING-NEXT: vzext.vf2 v11, v8 -; FOLDING-NEXT: vzext.vf2 v8, v9 -; FOLDING-NEXT: vzext.vf2 v9, v10 +; FOLDING-NEXT: vsext.vf4 v11, v8 +; FOLDING-NEXT: vsext.vf4 v8, v9 +; FOLDING-NEXT: vsext.vf4 v9, v10 ; FOLDING-NEXT: vmul.vv v8, v11, v8 ; FOLDING-NEXT: vadd.vv v10, v11, v9 ; FOLDING-NEXT: vsub.vv v9, v11, v9 ; FOLDING-NEXT: vor.vv v8, v8, v10 ; FOLDING-NEXT: vor.vv v8, v8, v9 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = sext %a to + %d = sext %b to + %d2 = sext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + +define @vwop_vscale_zext_i8i16_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_zext_i8i16_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; NO_FOLDING-NEXT: vle8.v v8, (a0) +; NO_FOLDING-NEXT: vle8.v v9, (a1) +; NO_FOLDING-NEXT: vle8.v v10, (a2) +; NO_FOLDING-NEXT: vzext.vf2 v11, v8 +; NO_FOLDING-NEXT: vzext.vf2 v8, v9 +; NO_FOLDING-NEXT: vzext.vf2 v9, v10 +; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vwop_vscale_zext_i8i16_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; FOLDING-NEXT: vle8.v v8, 
(a0) +; FOLDING-NEXT: vle8.v v9, (a1) +; FOLDING-NEXT: vle8.v v10, (a2) +; FOLDING-NEXT: vwmulu.vv v11, v8, v9 +; FOLDING-NEXT: vwaddu.vv v9, v8, v10 +; FOLDING-NEXT: vwsubu.vv v12, v8, v10 +; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; FOLDING-NEXT: vor.vv v8, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v12 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y @@ -111,3 +353,242 @@ define @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr % %i = or %h, %g ret %i } + +define @vwop_vscale_zext_i16i32_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_zext_i16i32_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; NO_FOLDING-NEXT: vle16.v v8, (a0) +; NO_FOLDING-NEXT: vle16.v v9, (a1) +; NO_FOLDING-NEXT: vle16.v v10, (a2) +; NO_FOLDING-NEXT: vzext.vf2 v11, v8 +; NO_FOLDING-NEXT: vzext.vf2 v8, v9 +; NO_FOLDING-NEXT: vzext.vf2 v9, v10 +; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vwop_vscale_zext_i16i32_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; FOLDING-NEXT: vle16.v v8, (a0) +; FOLDING-NEXT: vle16.v v9, (a1) +; FOLDING-NEXT: vle16.v v10, (a2) +; FOLDING-NEXT: vwmulu.vv v11, v8, v9 +; FOLDING-NEXT: vwaddu.vv v9, v8, v10 +; FOLDING-NEXT: vwsubu.vv v12, v8, v10 +; FOLDING-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; FOLDING-NEXT: vor.vv v8, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v12 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = zext %a to + %d = zext %b to + %d2 = zext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + +define @vwop_vscale_zext_i32i64_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_zext_i32i64_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vl1re32.v v8, (a0) +; NO_FOLDING-NEXT: vl1re32.v v9, (a1) +; NO_FOLDING-NEXT: vl1re32.v v10, (a2) +; NO_FOLDING-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; NO_FOLDING-NEXT: vzext.vf2 v12, v8 +; NO_FOLDING-NEXT: vzext.vf2 v14, v9 +; NO_FOLDING-NEXT: vzext.vf2 v8, v10 +; NO_FOLDING-NEXT: vmul.vv v10, v12, v14 +; NO_FOLDING-NEXT: vadd.vv v14, v12, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v12, v8 +; NO_FOLDING-NEXT: vor.vv v10, v10, v14 +; NO_FOLDING-NEXT: vor.vv v8, v10, v8 +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vwop_vscale_zext_i32i64_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vl1re32.v v8, (a0) +; FOLDING-NEXT: vl1re32.v v9, (a1) +; FOLDING-NEXT: vl1re32.v v10, (a2) +; FOLDING-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; FOLDING-NEXT: vwmulu.vv v12, v8, v9 +; FOLDING-NEXT: vwaddu.vv v14, v8, v10 +; FOLDING-NEXT: vwsubu.vv v16, v8, v10 +; FOLDING-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; FOLDING-NEXT: vor.vv v8, v12, v14 +; FOLDING-NEXT: vor.vv v8, v8, v16 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = zext %a to + %d = zext %b to + %d2 = zext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + +define @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) { +; RV32-LABEL: vwop_vscale_zext_i1i32_multiple_users: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu +; RV32-NEXT: vlm.v v0, (a0) +; RV32-NEXT: vlm.v v8, (a2) +; RV32-NEXT: vlm.v v9, (a1) 
+; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmerge.vim v11, v10, 1, v0 +; RV32-NEXT: vmv.v.v v0, v8 +; RV32-NEXT: vmerge.vim v8, v10, 1, v0 +; RV32-NEXT: vadd.vv v10, v11, v8 +; RV32-NEXT: vsub.vv v8, v11, v8 +; RV32-NEXT: vmv.v.v v0, v9 +; RV32-NEXT: vor.vv v10, v10, v11, v0.t +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwop_vscale_zext_i1i32_multiple_users: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vlm.v v0, (a0) +; RV64-NEXT: vlm.v v8, (a1) +; RV64-NEXT: vlm.v v9, (a2) +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vmerge.vim v11, v10, 1, v0 +; RV64-NEXT: vmv.v.v v0, v8 +; RV64-NEXT: vmerge.vim v8, v10, 1, v0 +; RV64-NEXT: vmv.v.v v0, v9 +; RV64-NEXT: vmerge.vim v9, v10, 1, v0 +; RV64-NEXT: vmul.vv v8, v11, v8 +; RV64-NEXT: vadd.vv v10, v11, v9 +; RV64-NEXT: vsub.vv v9, v11, v9 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = zext %a to + %d = zext %b to + %d2 = zext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + +define @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; NO_FOLDING-NEXT: vlm.v v0, (a0) +; NO_FOLDING-NEXT: vlm.v v8, (a1) +; NO_FOLDING-NEXT: vlm.v v9, (a2) +; NO_FOLDING-NEXT: vmv.v.i v10, 0 +; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 +; NO_FOLDING-NEXT: vmv1r.v v0, v8 +; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 +; NO_FOLDING-NEXT: vmv1r.v v0, v9 +; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 +; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; FOLDING-NEXT: vlm.v v0, (a0) +; FOLDING-NEXT: vlm.v v8, (a1) +; FOLDING-NEXT: vlm.v v9, (a2) +; FOLDING-NEXT: vmv.v.i v10, 0 +; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 +; FOLDING-NEXT: vmv1r.v v0, v8 +; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 +; FOLDING-NEXT: vmv1r.v v0, v9 +; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 +; FOLDING-NEXT: vmul.vv v8, v11, v8 +; FOLDING-NEXT: vadd.vv v10, v11, v9 +; FOLDING-NEXT: vsub.vv v9, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v10 +; FOLDING-NEXT: vor.vv v8, v8, v9 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = zext %a to + %d = zext %b to + %d2 = zext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + +define @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y, ptr %z) { +; NO_FOLDING-LABEL: vwop_vscale_zext_i8i32_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; NO_FOLDING-NEXT: vle8.v v8, (a0) +; NO_FOLDING-NEXT: vle8.v v9, (a1) +; NO_FOLDING-NEXT: vle8.v v10, (a2) +; NO_FOLDING-NEXT: vzext.vf4 v11, v8 +; NO_FOLDING-NEXT: vzext.vf4 v8, v9 +; NO_FOLDING-NEXT: vzext.vf4 v9, v10 +; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vwop_vscale_zext_i8i32_multiple_users: +; FOLDING: # %bb.0: 
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; FOLDING-NEXT: vle8.v v8, (a0) +; FOLDING-NEXT: vle8.v v9, (a1) +; FOLDING-NEXT: vle8.v v10, (a2) +; FOLDING-NEXT: vzext.vf4 v11, v8 +; FOLDING-NEXT: vzext.vf4 v8, v9 +; FOLDING-NEXT: vzext.vf4 v9, v10 +; FOLDING-NEXT: vmul.vv v8, v11, v8 +; FOLDING-NEXT: vadd.vv v10, v11, v9 +; FOLDING-NEXT: vsub.vv v9, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v10 +; FOLDING-NEXT: vor.vv v8, v8, v9 +; FOLDING-NEXT: ret + %a = load , ptr %x + %b = load , ptr %y + %b2 = load , ptr %z + %c = zext %a to + %d = zext %b to + %d2 = zext %b2 to + %e = mul %c, %d + %f = add %c, %d2 + %g = sub %c, %d2 + %h = or %e, %f + %i = or %h, %g + ret %i +} + + +
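
Note (not part of the patch): a minimal sketch of how the new combine on plain ISD::ADD can be exercised. The function name, the <vscale x 2 x i16>/<vscale x 2 x i32> types, and the RUN invocation below are illustrative choices that mirror the conventions of vscale-vw-web-simplification.ll; with both operands sign-extended to exactly twice the element width, the add is expected to select a single vwadd.vv rather than vsext.vf2 followed by vadd.vv, though no particular register allocation or schedule is asserted here.

; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o -

; Illustrative example, not taken from the patch's test files.
define <vscale x 2 x i32> @vwadd_from_plain_add(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y) {
  ; Both operands are sign extensions from a legal narrow type (i16) to twice
  ; the element width (i32), so the generic scalable-vector add should now be
  ; combined into the widening form.
  %xe = sext <vscale x 2 x i16> %x to <vscale x 2 x i32>
  %ye = sext <vscale x 2 x i16> %y to <vscale x 2 x i32>
  %sum = add <vscale x 2 x i32> %xe, %ye
  ret <vscale x 2 x i32> %sum
}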