From 456f07945118b7bff27f4733415c440d5e3566a0 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 3 Dec 2024 14:52:40 +0000 Subject: [PATCH 1/3] [SelectionDAG] Add an ISD node for vector.extract.last.active Since we shouldn't be changing lowering in SelectionDAGBuilder based on the target, introduce a new ISD node for extract.last.active and perform the current lowering in LegalizeVectorOps. This results in worse codegen for now, but it's easy for a target to match a single ISD node and improve the output. --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 + .../SelectionDAG/LegalizeIntegerTypes.cpp | 24 ++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 + .../SelectionDAG/LegalizeTypesGeneric.cpp | 61 ++++ .../SelectionDAG/LegalizeVectorOps.cpp | 80 +++++ .../SelectionDAG/SelectionDAGBuilder.cpp | 37 +-- .../SelectionDAG/SelectionDAGDumper.cpp | 3 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 + .../Target/AArch64/AArch64ISelLowering.cpp | 10 + .../AArch64/vector-extract-last-active.ll | 300 ++++++++++-------- .../RISCV/rvv/vector-extract-last-active.ll | 165 ++++++---- 11 files changed, 462 insertions(+), 229 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 604dc9419025b..d2ed8ec2e7466 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1480,6 +1480,10 @@ enum NodeType { // Output: Output Chain EXPERIMENTAL_VECTOR_HISTOGRAM, + // experimental.vector.extract.last.active intrinsic + // Operands: Data, Mask, PassThru + VECTOR_EXTRACT_LAST_ACTIVE, + // llvm.clear_cache intrinsic // Operands: Input Chain, Start Addres, End Address // Outputs: Output Chain diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index be7521f341685..6d75f0788203f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -155,6 +155,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ZERO_EXTEND_VECTOR_INREG: Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break; + case ISD::VECTOR_EXTRACT_LAST_ACTIVE: + Res = PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(N); + break; + case ISD::SIGN_EXTEND: case ISD::VP_SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -2069,6 +2073,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: Res = PromoteIntOp_VECTOR_HISTOGRAM(N, OpNo); break; + case ISD::VECTOR_EXTRACT_LAST_ACTIVE: + Res = PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. 
@@ -2810,6 +2817,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue +DAGTypeLegalizer::PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, + unsigned OpNo) { + SmallVector<SDValue, 3> NewOps(N->ops()); + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -2848,6 +2863,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break; case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break; case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break; + case ISD::VECTOR_EXTRACT_LAST_ACTIVE: + ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(N, Lo, Hi); + break; case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break; @@ -6124,6 +6142,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N) { + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); +} + SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 571a710cc92a3..0fc51c33d5f18 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -378,6 +378,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_VPFunnelShift(SDNode *N); SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); + SDValue PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -428,6 +429,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, unsigned OpNo); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -1215,6 +1217,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, SDValue &Lo, + SDValue &Hi); void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 113a3bc0bbea6..f7d4800487d60 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" using namespace llvm; @@ -244,6 +245,66 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, std::swap(Lo, Hi); } +void DAGTypeLegalizer::ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, + SDValue &Lo, + SDValue &Hi) { + SDValue Data = N->getOperand(0); + SDValue Mask = N->getOperand(1); + SDValue PassThru = N->getOperand(2); + + ElementCount OldEltCount = Data.getValueType().getVectorElementCount(); + EVT OldEltVT = Data.getValueType().getVectorElementType(); + SDLoc dl(N); + + EVT OldVT = N->getValueType(0); + EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT); + + if (OldVT != OldEltVT) { + // The result of EXTRACT_LAST_ACTIVE may be larger than the element type of + // the input vector. If so, extend the elements of the input vector to the + // same bitwidth as the result before expanding. + assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!"); + EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldEltCount); + Data = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0)); + } + + SDValue NewVec = DAG.getNode( + ISD::BITCAST, dl, + EVT::getVectorVT(*DAG.getContext(), NewVT, OldEltCount * 2), Data); + + auto [DataLo, DataHi] = DAG.SplitVector(NewVec, dl); + auto [PassLo, PassHi] = DAG.SplitScalar(PassThru, dl, NewVT, NewVT); + + EVT SplitVT = DataLo.getValueType(); + + // TODO: I *think* this works correctly, but I haven't confirmed it yet by + // actually running a compiled program with example data. + // + // We want the matching lo and hi parts from whichever lane was the last + // active. 
+ SDValue Deinterleaved; + if (SplitVT.isFixedLengthVector()) { + unsigned SplitNum = SplitVT.getVectorMinNumElements(); + SDValue Even = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi, + createStrideMask(0, 2, SplitNum)); + SDValue Odd = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi, + createStrideMask(1, 2, SplitNum)); + Deinterleaved = DAG.getMergeValues({Even, Odd}, dl); + } else + Deinterleaved = + DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl, + DAG.getVTList(SplitVT, SplitVT), DataLo, DataHi); + + Lo = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT, + Deinterleaved.getValue(0), Mask, PassLo); + Hi = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT, + Deinterleaved.getValue(1), Mask, PassHi); + + // FIXME: Endianness? + assert(!DAG.getDataLayout().isBigEndian() && + "Implement big endian result expansion for extract_last_active"); +} + void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(ISD::isNormalLoad(N) && "This routine only for normal loads!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index e8404a13009a7..8026e5c845b41 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -138,6 +139,7 @@ class VectorLegalizer { SDValue ExpandVP_FNEG(SDNode *Node); SDValue ExpandVP_FABS(SDNode *Node); SDValue ExpandVP_FCOPYSIGN(SDNode *Node); + SDValue ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node); SDValue ExpandSELECT(SDNode *Node); std::pair<SDValue, SDValue> ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); @@ -467,6 +469,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: + case ISD::VECTOR_EXTRACT_LAST_ACTIVE: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1208,6 +1211,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::VECTOR_COMPRESS: Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG)); return; + case ISD::VECTOR_EXTRACT_LAST_ACTIVE: + Results.push_back(ExpandVECTOR_EXTRACT_LAST_ACTIVE(Node)); + return; case ISD::SCMP: case ISD::UCMP: Results.push_back(TLI.expandCMP(Node, DAG)); return; @@ -1719,6 +1725,80 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); } +SDValue VectorLegalizer::ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node) { + SDLoc DL(Node); + SDValue Data = Node->getOperand(0); + SDValue Mask = Node->getOperand(1); + SDValue PassThru = Node->getOperand(2); + + EVT DataVT = Data.getValueType(); + EVT ScalarVT = PassThru.getValueType(); + EVT BoolVT = Mask.getValueType().getScalarType(); + + // Find a suitable type for a stepvector. + ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value. 
+ if (DataVT.isScalableVector()) + VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned EltWidth = TLI.getBitWidthForCttzElements( + ScalarVT.getTypeForEVT(*DAG.getContext()), DataVT.getVectorElementCount(), + /*ZeroIsPoison=*/true, &VScaleRange); + + // HACK: If the target selects a VT that's too wide based on the legal types + // for a vecreduce_umax, it will force expansion of the node -- which + // doesn't work on scalable vectors... + // Is there another method we could use to get a smaller VT instead + // of just capping to 32b? + EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u)); + EVT StepVecVT = DataVT.changeVectorElementType(StepVT); + + // HACK: If the target selects a VT that's too small to form a legal vector + // type, we also run into problems trying to expand the vecreduce_umax. + // + // I think perhaps we need to revisit how getBitWidthForCttzElements + // works... + if (TLI.getTypeAction(StepVecVT.getSimpleVT()) == + TargetLowering::TypePromoteInteger) { + StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT); + StepVT = StepVecVT.getVectorElementType(); + } + + // Zero out lanes with inactive elements, then find the highest remaining + // value from the stepvector. + SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT); + SDValue StepVec = DAG.getStepVector(DL, StepVecVT); + SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes); + + // HACK: Unfortunately, LegalizeVectorOps does not recursively legalize *all* + // added nodes, just the end result nodes until it finds legal ops. + // LegalizeDAG doesn't handle VSELECT at all presently. So if we need to + // legalize a vselect then we have to do it here. + // + // We might want to change LegalizeVectorOps to walk backwards through the + // nodes like LegalizeDAG? And share VSELECT legalization code with + // LegalizeDAG? + // + // Or would that cause problems with illegal types that we might have just + // introduced? + // + // Having a legal op with illegal types marked as Legal should work, with the + // expectation being that type legalization fixes it up later. + if (TLI.getOperationAction(ISD::VSELECT, StepVecVT) == TargetLowering::Expand) + ActiveElts = LegalizeOp(ActiveElts); + + SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts); + + // Extract the corresponding lane from the data vector + EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, DL, ExtVT); + SDValue Extract = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Data, Idx); + + // If all mask lanes were inactive, choose the passthru value instead. + SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, DL, BoolVT, Mask); + return DAG.getSelect(DL, ScalarVT, AnyActive, Extract, PassThru); +} + void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl<SDValue> &Results) { // Attempt to expand using TargetLowering. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f8d7c3ef7bbe7..d1c644b064718 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6426,43 +6426,18 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic) { assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active && "Tried lowering invalid vector extract last"); + SDLoc sdl = getCurSDLoc(); SDValue Data = getValue(I.getOperand(0)); SDValue Mask = getValue(I.getOperand(1)); SDValue PassThru = getValue(I.getOperand(2)); - EVT DataVT = Data.getValueType(); - EVT ScalarVT = PassThru.getValueType(); - EVT BoolVT = Mask.getValueType().getScalarType(); - - // Find a suitable type for a stepvector. - ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value. - if (DataVT.isScalableVector()) - VScaleRange = getVScaleRange(I.getCaller(), 64); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned EltWidth = TLI.getBitWidthForCttzElements( - I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true, - &VScaleRange); - MVT StepVT = MVT::getIntegerVT(EltWidth); - EVT StepVecVT = DataVT.changeVectorElementType(StepVT); - - // Zero out lanes with inactive elements, then find the highest remaining - // value from the stepvector. - SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT); - SDValue StepVec = DAG.getStepVector(sdl, StepVecVT); - SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes); - SDValue HighestIdx = - DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts); - - // Extract the corresponding lane from the data vector - EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout()); - SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT); - SDValue Extract = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx); - - // If all mask lanes were inactive, choose the passthru value instead. - SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask); - SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru); + EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + SDValue Result = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, sdl, ResultVT, + Data, Mask, PassThru); + setValue(&I, Result); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 580ff19065557..42cbb721703d9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -567,6 +567,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return "histogram"; + case ISD::VECTOR_EXTRACT_LAST_ACTIVE: + return "extract_last_active"; + // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ case ISD::SDID: \ diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 3b0e9c7526fd0..cc822ad5ec50e 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -818,6 +818,9 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SDOPC, VT, Expand); #include "llvm/IR/VPIntrinsics.def" + // Masked vector extracts default to expand. + setOperationAction(ISD::VECTOR_EXTRACT_LAST_ACTIVE, VT, Expand); + // FP environment operations default to expand. 
setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3ad2905ce5207..14fd7851bfa10 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -401,6 +401,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); } + // TODO: Should we include any other operations here? The calls to + // addDRType/addQRType below do mark VSELECT as Expand for the + // specified VTs, but leave other illegal types as the default + // of 'Legal'. LegalizeDAG doesn't legalize VSELECT after type + // legalization if LegalizeVectorOps introduces one. + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + setOperationAction(ISD::VSELECT, VT, Expand); + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + setOperationAction(ISD::VSELECT, VT, Expand); + if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll index 5212acc6fca0f..a0e9c6607042f 100644 --- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll +++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll @@ -7,21 +7,26 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) { ; NEON-FIXED: // %bb.0: ; NEON-FIXED-NEXT: sub sp, sp, #16 ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 -; NEON-FIXED-NEXT: cmeq v2.16b, v1.16b, #0 +; NEON-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b +; NEON-FIXED-NEXT: cmeq v1.16b, v1.16b, #0 ; NEON-FIXED-NEXT: adrp x8, .LCPI0_0 -; NEON-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b -; NEON-FIXED-NEXT: ldr q3, [x8, :lo12:.LCPI0_0] -; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI0_0] +; NEON-FIXED-NEXT: mov x11, sp ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: bic v2.16b, v3.16b, v2.16b +; NEON-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b +; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 ; NEON-FIXED-NEXT: umaxv b1, v1.16b -; NEON-FIXED-NEXT: umaxv b2, v2.16b -; NEON-FIXED-NEXT: fmov w8, s2 -; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4 -; NEON-FIXED-NEXT: ldrb w8, [x9] -; NEON-FIXED-NEXT: fmov w9, s1 -; NEON-FIXED-NEXT: tst w9, #0x1 -; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b +; NEON-FIXED-NEXT: fmov w10, s1 +; NEON-FIXED-NEXT: fmov x8, d2 +; NEON-FIXED-NEXT: bfxil x11, x10, #0, #4 +; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32 +; NEON-FIXED-NEXT: lsr x9, x8, #16 +; NEON-FIXED-NEXT: orr w8, w8, w9 +; NEON-FIXED-NEXT: ldrb w9, [x11] +; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8 +; NEON-FIXED-NEXT: tst w8, #0xff +; NEON-FIXED-NEXT: csel w0, w9, w0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -29,20 +34,25 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) { ; SVE-FIXED: // %bb.0: ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 -; SVE-FIXED-NEXT: index z2.b, #0, #1 -; SVE-FIXED-NEXT: cmeq v3.16b, v1.16b, #0 -; SVE-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b -; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b +; SVE-FIXED-NEXT: index z4.b, #0, #1 +; SVE-FIXED-NEXT: cmeq v1.16b, v1.16b, #0 +; SVE-FIXED-NEXT: mov x11, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: bic 
v2.16b, v2.16b, v3.16b +; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; SVE-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b ; SVE-FIXED-NEXT: umaxv b1, v1.16b -; SVE-FIXED-NEXT: umaxv b2, v2.16b -; SVE-FIXED-NEXT: fmov w8, s2 -; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4 -; SVE-FIXED-NEXT: ldrb w8, [x9] -; SVE-FIXED-NEXT: fmov w9, s1 -; SVE-FIXED-NEXT: tst w9, #0x1 -; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b +; SVE-FIXED-NEXT: fmov x8, d2 +; SVE-FIXED-NEXT: fmov w10, s1 +; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32 +; SVE-FIXED-NEXT: bfxil x11, x10, #0, #4 +; SVE-FIXED-NEXT: lsr x9, x8, #16 +; SVE-FIXED-NEXT: orr w8, w8, w9 +; SVE-FIXED-NEXT: ldrb w9, [x11] +; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8 +; SVE-FIXED-NEXT: tst w8, #0xff +; SVE-FIXED-NEXT: csel w0, w9, w0, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <16 x i8> %mask, zeroinitializer @@ -57,19 +67,22 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h ; NEON-FIXED-NEXT: adrp x8, .LCPI1_0 -; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI1_0] +; NEON-FIXED-NEXT: mov x11, sp +; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v2.8b, v1.8h -; NEON-FIXED-NEXT: umaxv h1, v1.8h -; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b -; NEON-FIXED-NEXT: umaxv b2, v2.8b -; NEON-FIXED-NEXT: fmov w8, s2 -; NEON-FIXED-NEXT: bfi x9, x8, #1, #3 -; NEON-FIXED-NEXT: ldrh w8, [x9] -; NEON-FIXED-NEXT: fmov w9, s1 -; NEON-FIXED-NEXT: tst w9, #0x1 -; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: xtn v1.8b, v1.8h +; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; NEON-FIXED-NEXT: fmov x8, d1 +; NEON-FIXED-NEXT: umaxv b1, v2.8b +; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32 +; NEON-FIXED-NEXT: lsr x9, x8, #16 +; NEON-FIXED-NEXT: fmov w10, s1 +; NEON-FIXED-NEXT: orr w8, w8, w9 +; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8 +; NEON-FIXED-NEXT: bfi x11, x10, #1, #3 +; NEON-FIXED-NEXT: tst w8, #0xff +; NEON-FIXED-NEXT: ldrh w9, [x11] +; NEON-FIXED-NEXT: csel w0, w9, w0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -78,19 +91,22 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h -; SVE-FIXED-NEXT: index z3.b, #0, #1 -; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: index z2.b, #0, #1 +; SVE-FIXED-NEXT: mov x11, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v2.8b, v1.8h -; SVE-FIXED-NEXT: umaxv h1, v1.8h -; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b -; SVE-FIXED-NEXT: umaxv b2, v2.8b -; SVE-FIXED-NEXT: fmov w8, s2 -; SVE-FIXED-NEXT: bfi x9, x8, #1, #3 -; SVE-FIXED-NEXT: ldrh w8, [x9] -; SVE-FIXED-NEXT: fmov w9, s1 -; SVE-FIXED-NEXT: tst w9, #0x1 -; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: xtn v1.8b, v1.8h +; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; SVE-FIXED-NEXT: fmov x8, d1 +; SVE-FIXED-NEXT: umaxv b1, v2.8b +; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32 +; SVE-FIXED-NEXT: lsr x9, x8, #16 +; SVE-FIXED-NEXT: fmov w10, s1 +; SVE-FIXED-NEXT: orr w8, w8, w9 +; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8 +; SVE-FIXED-NEXT: bfi x11, x10, #1, #3 +; SVE-FIXED-NEXT: tst w8, #0xff +; SVE-FIXED-NEXT: ldrh w9, [x11] +; SVE-FIXED-NEXT: csel w0, w9, w0, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <8 x i16> %mask, 
zeroinitializer @@ -105,19 +121,21 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s ; NEON-FIXED-NEXT: adrp x8, .LCPI2_0 -; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI2_0] +; NEON-FIXED-NEXT: mov x11, sp +; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v2.4h, v1.4s -; NEON-FIXED-NEXT: umaxv s1, v1.4s -; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b +; NEON-FIXED-NEXT: xtn v1.4h, v1.4s +; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; NEON-FIXED-NEXT: fmov x8, d1 ; NEON-FIXED-NEXT: umaxv h2, v2.4h -; NEON-FIXED-NEXT: fmov w8, s2 -; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 -; NEON-FIXED-NEXT: ldr w8, [x9] -; NEON-FIXED-NEXT: fmov w9, s1 -; NEON-FIXED-NEXT: tst w9, #0x1 -; NEON-FIXED-NEXT: csel w0, w8, w0, ne +; NEON-FIXED-NEXT: lsr x9, x8, #32 +; NEON-FIXED-NEXT: orr w8, w8, w9 +; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16 +; NEON-FIXED-NEXT: fmov w10, s2 +; NEON-FIXED-NEXT: tst w8, #0xffff +; NEON-FIXED-NEXT: bfi x11, x10, #2, #2 +; NEON-FIXED-NEXT: ldr w9, [x11] +; NEON-FIXED-NEXT: csel w0, w9, w0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -126,19 +144,21 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s -; SVE-FIXED-NEXT: index z3.h, #0, #1 -; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: index z2.h, #0, #1 +; SVE-FIXED-NEXT: mov x11, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v2.4h, v1.4s -; SVE-FIXED-NEXT: umaxv s1, v1.4s -; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b +; SVE-FIXED-NEXT: xtn v1.4h, v1.4s +; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; SVE-FIXED-NEXT: fmov x8, d1 ; SVE-FIXED-NEXT: umaxv h2, v2.4h -; SVE-FIXED-NEXT: fmov w8, s2 -; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 -; SVE-FIXED-NEXT: ldr w8, [x9] -; SVE-FIXED-NEXT: fmov w9, s1 -; SVE-FIXED-NEXT: tst w9, #0x1 -; SVE-FIXED-NEXT: csel w0, w8, w0, ne +; SVE-FIXED-NEXT: lsr x9, x8, #32 +; SVE-FIXED-NEXT: orr w8, w8, w9 +; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16 +; SVE-FIXED-NEXT: fmov w10, s2 +; SVE-FIXED-NEXT: tst w8, #0xffff +; SVE-FIXED-NEXT: bfi x11, x10, #2, #2 +; SVE-FIXED-NEXT: ldr w9, [x11] +; SVE-FIXED-NEXT: csel w0, w9, w0, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <4 x i32> %mask, zeroinitializer @@ -153,19 +173,20 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d ; NEON-FIXED-NEXT: adrp x8, .LCPI3_0 -; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI3_0] +; NEON-FIXED-NEXT: mov x10, sp +; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v2.2s, v1.2d -; NEON-FIXED-NEXT: umaxv s1, v1.4s -; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b +; NEON-FIXED-NEXT: xtn v1.2s, v1.2d +; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; NEON-FIXED-NEXT: fmov x8, d1 ; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s -; NEON-FIXED-NEXT: fmov w8, s2 -; NEON-FIXED-NEXT: bfi x9, x8, #3, #1 -; NEON-FIXED-NEXT: ldr x8, [x9] -; NEON-FIXED-NEXT: fmov w9, s1 -; NEON-FIXED-NEXT: tst w9, #0x1 -; NEON-FIXED-NEXT: csel x0, x8, x0, ne +; NEON-FIXED-NEXT: fmov w9, s2 +; NEON-FIXED-NEXT: bfi x10, x9, #3, #1 +; NEON-FIXED-NEXT: lsr x9, x8, #32 +; NEON-FIXED-NEXT: ldr 
x10, [x10] +; NEON-FIXED-NEXT: orr w8, w8, w9 +; NEON-FIXED-NEXT: cmp w8, #0 +; NEON-FIXED-NEXT: csel x0, x10, x0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -174,19 +195,20 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d -; SVE-FIXED-NEXT: index z3.s, #0, #1 -; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: index z2.s, #0, #1 +; SVE-FIXED-NEXT: mov x10, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v2.2s, v1.2d -; SVE-FIXED-NEXT: umaxv s1, v1.4s -; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b +; SVE-FIXED-NEXT: xtn v1.2s, v1.2d +; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b +; SVE-FIXED-NEXT: fmov x8, d1 ; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s -; SVE-FIXED-NEXT: fmov w8, s2 -; SVE-FIXED-NEXT: bfi x9, x8, #3, #1 -; SVE-FIXED-NEXT: ldr x8, [x9] -; SVE-FIXED-NEXT: fmov w9, s1 -; SVE-FIXED-NEXT: tst w9, #0x1 -; SVE-FIXED-NEXT: csel x0, x8, x0, ne +; SVE-FIXED-NEXT: fmov w9, s2 +; SVE-FIXED-NEXT: bfi x10, x9, #3, #1 +; SVE-FIXED-NEXT: lsr x9, x8, #32 +; SVE-FIXED-NEXT: ldr x10, [x10] +; SVE-FIXED-NEXT: orr w8, w8, w9 +; SVE-FIXED-NEXT: cmp w8, #0 +; SVE-FIXED-NEXT: csel x0, x10, x0, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <2 x i64> %mask, zeroinitializer @@ -201,18 +223,20 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s ; NEON-FIXED-NEXT: adrp x8, .LCPI4_0 -; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI4_0] +; NEON-FIXED-NEXT: mov x11, sp +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v3.4h, v1.4s -; NEON-FIXED-NEXT: umaxv s1, v1.4s -; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b +; NEON-FIXED-NEXT: xtn v1.4h, v1.4s +; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; NEON-FIXED-NEXT: fmov x8, d1 ; NEON-FIXED-NEXT: umaxv h3, v3.4h -; NEON-FIXED-NEXT: fmov w8, s3 -; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 -; NEON-FIXED-NEXT: fmov w8, s1 -; NEON-FIXED-NEXT: ldr s0, [x9] -; NEON-FIXED-NEXT: tst w8, #0x1 +; NEON-FIXED-NEXT: lsr x9, x8, #32 +; NEON-FIXED-NEXT: orr w8, w8, w9 +; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16 +; NEON-FIXED-NEXT: fmov w10, s3 +; NEON-FIXED-NEXT: tst w8, #0xffff +; NEON-FIXED-NEXT: bfi x11, x10, #2, #2 +; NEON-FIXED-NEXT: ldr s0, [x11] ; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret @@ -222,18 +246,20 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s -; SVE-FIXED-NEXT: index z4.h, #0, #1 -; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: index z3.h, #0, #1 +; SVE-FIXED-NEXT: mov x11, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v3.4h, v1.4s -; SVE-FIXED-NEXT: umaxv s1, v1.4s -; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b +; SVE-FIXED-NEXT: xtn v1.4h, v1.4s +; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; SVE-FIXED-NEXT: fmov x8, d1 ; SVE-FIXED-NEXT: umaxv h3, v3.4h -; SVE-FIXED-NEXT: fmov w8, s3 -; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 -; SVE-FIXED-NEXT: fmov w8, s1 -; SVE-FIXED-NEXT: ldr s0, [x9] -; SVE-FIXED-NEXT: tst w8, #0x1 +; SVE-FIXED-NEXT: lsr x9, x8, #32 +; SVE-FIXED-NEXT: orr w8, w8, w9 +; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16 +; SVE-FIXED-NEXT: fmov w10, 
s3 +; SVE-FIXED-NEXT: tst w8, #0xffff +; SVE-FIXED-NEXT: bfi x11, x10, #2, #2 +; SVE-FIXED-NEXT: ldr s0, [x11] ; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret @@ -249,18 +275,19 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double % ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d ; NEON-FIXED-NEXT: adrp x8, .LCPI5_0 -; NEON-FIXED-NEXT: mov x9, sp -; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI5_0] +; NEON-FIXED-NEXT: mov x10, sp +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v3.2s, v1.2d -; NEON-FIXED-NEXT: umaxv s1, v1.4s -; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b +; NEON-FIXED-NEXT: xtn v1.2s, v1.2d +; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; NEON-FIXED-NEXT: fmov x8, d1 ; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s -; NEON-FIXED-NEXT: fmov w8, s3 -; NEON-FIXED-NEXT: bfi x9, x8, #3, #1 -; NEON-FIXED-NEXT: fmov w8, s1 -; NEON-FIXED-NEXT: ldr d0, [x9] -; NEON-FIXED-NEXT: tst w8, #0x1 +; NEON-FIXED-NEXT: fmov w9, s3 +; NEON-FIXED-NEXT: bfi x10, x9, #3, #1 +; NEON-FIXED-NEXT: lsr x9, x8, #32 +; NEON-FIXED-NEXT: ldr d0, [x10] +; NEON-FIXED-NEXT: orr w8, w8, w9 +; NEON-FIXED-NEXT: cmp w8, #0 ; NEON-FIXED-NEXT: fcsel d0, d0, d2, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret @@ -270,18 +297,19 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double % ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d -; SVE-FIXED-NEXT: index z4.s, #0, #1 -; SVE-FIXED-NEXT: mov x9, sp +; SVE-FIXED-NEXT: index z3.s, #0, #1 +; SVE-FIXED-NEXT: mov x10, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v3.2s, v1.2d -; SVE-FIXED-NEXT: umaxv s1, v1.4s -; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b +; SVE-FIXED-NEXT: xtn v1.2s, v1.2d +; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b +; SVE-FIXED-NEXT: fmov x8, d1 ; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s -; SVE-FIXED-NEXT: fmov w8, s3 -; SVE-FIXED-NEXT: bfi x9, x8, #3, #1 -; SVE-FIXED-NEXT: fmov w8, s1 -; SVE-FIXED-NEXT: ldr d0, [x9] -; SVE-FIXED-NEXT: tst w8, #0x1 +; SVE-FIXED-NEXT: fmov w9, s3 +; SVE-FIXED-NEXT: bfi x10, x9, #3, #1 +; SVE-FIXED-NEXT: lsr x9, x8, #32 +; SVE-FIXED-NEXT: ldr d0, [x10] +; SVE-FIXED-NEXT: orr w8, w8, w9 +; SVE-FIXED-NEXT: cmp w8, #0 ; SVE-FIXED-NEXT: fcsel d0, d0, d2, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret @@ -318,7 +346,7 @@ define i16 @extract_last_i16_scalable( %data, %data, %data, %data, %data, %data, <2 x i64> %mask, i64 %passthru) { ; RV32-LABEL: extract_last_i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmsne.vi v0, v9, 0 +; RV32-NEXT: vmsne.vi v9, v9, 0 +; RV32-NEXT: vmv.v.i v0, 1 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmv1r.v v11, v10 +; RV32-NEXT: vcpop.m a2, v9 +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32-NEXT: vrgather.vi v11, v8, 1, v0.t +; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: vcpop.m a2, v0 -; RV32-NEXT: vid.v v9, v0.t +; RV32-NEXT: vid.v v12, v0.t ; RV32-NEXT: beqz a2, .LBB3_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: vredmaxu.vs v9, v9, v9 -; RV32-NEXT: li a1, 32 +; RV32-NEXT: vredmaxu.vs v9, v12, v12 +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RV32-NEXT: vslideup.vi v8, v10, 1 +; RV32-NEXT: vsetvli zero, 
zero, e8, mf8, ta, ma ; RV32-NEXT: vmv.x.s a0, v9 ; RV32-NEXT: andi a0, a0, 255 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v9, v11, a0 ; RV32-NEXT: vslidedown.vx v8, v8, a0 +; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a1 -; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: .LBB3_2: ; RV32-NEXT: ret ; @@ -168,22 +176,39 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double % } define i8 @extract_last_i8_scalable( %data, %mask, i8 %passthru) { -; CHECK-LABEL: extract_last_i8_scalable: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vcpop.m a1, v0 -; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: beqz a1, .LBB6_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: vredmaxu.vs v10, v10, v10 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: .LBB6_2: -; CHECK-NEXT: ret +; RV32-LABEL: extract_last_i8_scalable: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vid.v v16, v0.t +; RV32-NEXT: beqz a1, .LBB6_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: vredmaxu.vs v10, v16, v16 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV32-NEXT: vslidedown.vx v8, v8, a0 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: .LBB6_2: +; RV32-NEXT: ret +; +; RV64-LABEL: extract_last_i8_scalable: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vid.v v16, v0.t +; RV64-NEXT: beqz a1, .LBB6_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: vredmaxu.vs v10, v16, v16 +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; RV64-NEXT: vslidedown.vx v8, v8, a0 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: .LBB6_2: +; RV64-NEXT: ret %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8( %data, %mask, i8 %passthru) ret i8 %res } @@ -191,16 +216,14 @@ define i8 @extract_last_i8_scalable( %data, define i16 @extract_last_i16_scalable( %data, %mask, i16 %passthru) { ; RV32-LABEL: extract_last_i16_scalable: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: vcpop.m a1, v0 -; RV32-NEXT: vid.v v10, v0.t +; RV32-NEXT: vid.v v12, v0.t ; RV32-NEXT: beqz a1, .LBB7_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: vredmaxu.vs v10, v10, v10 +; RV32-NEXT: vredmaxu.vs v10, v12, v12 ; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: slli a0, a0, 16 -; RV32-NEXT: srli a0, a0, 16 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vslidedown.vx v8, v8, a0 ; RV32-NEXT: vmv.x.s a0, v8 @@ -209,16 +232,16 @@ define i16 @extract_last_i16_scalable( %data, %data, %data, %mask, i64 %passthru) { ; RV32-LABEL: extract_last_i64_scalable: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, mu +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vcpop.m a2, v0 ; RV32-NEXT: vid.v v10, v0.t ; RV32-NEXT: beqz a2, .LBB9_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vredmaxu.vs v10, v10, v10 -; RV32-NEXT: li a1, 32 ; RV32-NEXT: vmv.x.s a0, v10 -; 
RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vx v8, v8, a0 +; RV32-NEXT: vnsrl.wi v10, v8, 0 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vnsrl.wx v11, v8, a1 +; RV32-NEXT: vslidedown.vx v8, v10, a0 +; RV32-NEXT: vslidedown.vx v9, v11, a0 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a1 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vmv.x.s a1, v9 ; RV32-NEXT: .LBB9_2: ; RV32-NEXT: ret ; ; RV64-LABEL: extract_last_i64_scalable: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vcpop.m a1, v0 ; RV64-NEXT: vid.v v10, v0.t @@ -297,6 +320,8 @@ define i64 @extract_last_i64_scalable( %data, %data, %data, %mask, double %passthru) { -; CHECK-LABEL: extract_last_double_scalable: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vcpop.m a0, v0 -; CHECK-NEXT: vid.v v10, v0.t -; CHECK-NEXT: beqz a0, .LBB11_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: vredmaxu.vs v10, v10, v10 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: ret +; RV32-LABEL: extract_last_double_scalable: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vcpop.m a0, v0 +; RV32-NEXT: vid.v v10, v0.t +; RV32-NEXT: beqz a0, .LBB11_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: vredmaxu.vs v10, v10, v10 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vslidedown.vx v8, v8, a0 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: .LBB11_2: +; RV32-NEXT: ret +; +; RV64-LABEL: extract_last_double_scalable: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vcpop.m a0, v0 +; RV64-NEXT: vid.v v10, v0.t +; RV64-NEXT: beqz a0, .LBB11_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: vredmaxu.vs v10, v10, v10 +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vslidedown.vx v8, v8, a0 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: .LBB11_2: +; RV64-NEXT: ret %res = call double @llvm.experimental.vector.extract.last.active.nxv2f64( %data, %mask, double %passthru) ret double %res } From 3d9358d89bbad957af1f32ede2468385e08a0620 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 8 Jan 2025 14:31:02 +0000 Subject: [PATCH 2/3] Split up selectiondag representation into 3 parts. 
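
The single VECTOR_EXTRACT_LAST_ACTIVE node becomes three pieces: a new VECTOR_FIND_LAST_ACTIVE node that only computes the index of the last active mask lane, an ordinary EXTRACT_VECTOR_ELT of the data vector at that index, and a select between the extracted value and the passthru, guarded by a VECREDUCE_OR of the mask (and omitted entirely when the passthru is undef or poison). As a rough sketch of the DAG that SelectionDAGBuilder now constructs for an i32 extract, with data t2, mask t3, and passthru t4 (node numbers and value types are illustrative, not taken from actual output):

  t5: i64 = find_last_active t3
  t6: i32 = extract_vector_elt t2, t5
  t7: i1 = vecreduce_or t3
  t8: i32 = select t7, t6, t4

Keeping the index computation as its own node lets it be promoted or expanded with the usual type-legalization hooks, while the extract and select reuse existing legalization support.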
--- llvm/include/llvm/CodeGen/ISDOpcodes.h | 6 +- llvm/include/llvm/CodeGen/TargetLowering.h | 5 + .../SelectionDAG/LegalizeIntegerTypes.cpp | 22 +- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 6 +- .../SelectionDAG/LegalizeTypesGeneric.cpp | 61 ---- .../SelectionDAG/LegalizeVectorOps.cpp | 82 +---- .../SelectionDAG/SelectionDAGBuilder.cpp | 18 +- .../SelectionDAG/SelectionDAGDumper.cpp | 4 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 46 +++ llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 +- .../Target/AArch64/AArch64ISelLowering.cpp | 10 - .../AArch64/vector-extract-last-active.ll | 310 +++++++++--------- .../RISCV/rvv/vector-extract-last-active.ll | 293 ++++++----------- 13 files changed, 334 insertions(+), 531 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index d2ed8ec2e7466..fd8784a4c1003 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1480,9 +1480,9 @@ enum NodeType { // Output: Output Chain EXPERIMENTAL_VECTOR_HISTOGRAM, - // experimental.vector.extract.last.active intrinsic - // Operands: Data, Mask, PassThru - VECTOR_EXTRACT_LAST_ACTIVE, + // Finds the index of the last active mask element + // Operands: Mask + VECTOR_FIND_LAST_ACTIVE, // llvm.clear_cache intrinsic // Operands: Input Chain, Start Addres, End Address diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3751aac4df8ea..6edc750ea722d 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5368,6 +5368,11 @@ class TargetLowering : public TargetLoweringBase { /// \returns The expansion result or SDValue() if it fails. SDValue expandVPCTTZElements(SDNode *N, SelectionDAG &DAG) const; + /// Expand VECTOR_FIND_LAST_ACTIVE nodes + /// \param N Node to expand + /// \returns The expansion result or SDValue() if it fails. + SDValue expandVectorFindLastActive(SDNode *N, SelectionDAG &DAG) const; + /// Expand ABS nodes. Expands vector/scalar ABS nodes, /// vector nodes can only succeed if all operations are legal/custom. 
/// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 6d75f0788203f..c519603fae9a2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -155,8 +155,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ZERO_EXTEND_VECTOR_INREG: Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break; - case ISD::VECTOR_EXTRACT_LAST_ACTIVE: - Res = PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(N); + case ISD::VECTOR_FIND_LAST_ACTIVE: + Res = PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(N); break; case ISD::SIGN_EXTEND: @@ -2073,8 +2073,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: Res = PromoteIntOp_VECTOR_HISTOGRAM(N, OpNo); break; - case ISD::VECTOR_EXTRACT_LAST_ACTIVE: - Res = PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(N, OpNo); + case ISD::VECTOR_FIND_LAST_ACTIVE: + Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo); break; } @@ -2817,10 +2817,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } -SDValue -DAGTypeLegalizer::PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, - unsigned OpNo) { - SmallVector<SDValue, 3> NewOps(N->ops()); +SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo) { + SmallVector<SDValue, 1> NewOps(N->ops()); NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } @@ -2863,9 +2862,6 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break; case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break; case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break; - case ISD::VECTOR_EXTRACT_LAST_ACTIVE: - ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(N, Lo, Hi); - break; case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break; @@ -6142,10 +6138,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } -SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - return DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); + return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); } SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 0fc51c33d5f18..069e191d10d7d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -378,7 +378,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_VPFunnelShift(SDNode *N); SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); - SDValue PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N); + SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -429,7 +429,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); - SDValue PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -1217,8 +1217,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); - void ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, SDValue &Lo, - SDValue &Hi); void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index f7d4800487d60..113a3bc0bbea6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -19,7 +19,6 @@ //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" -#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" using namespace llvm; @@ -245,66 +244,6 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, std::swap(Lo, Hi); } -void DAGTypeLegalizer::ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, - SDValue &Lo, - SDValue &Hi) { - SDValue Data = N->getOperand(0); - SDValue Mask = N->getOperand(1); - SDValue PassThru = N->getOperand(2); - - ElementCount OldEltCount = Data.getValueType().getVectorElementCount(); - EVT OldEltVT = Data.getValueType().getVectorElementType(); - SDLoc dl(N); - - EVT OldVT = N->getValueType(0); - EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT); - - if (OldVT != OldEltVT) { - // The result of EXTRACT_LAST_ACTIVE may be larger than the element type of - // the input vector. If so, extend the elements of the input vector to the - // same bitwidth as the result before expanding. - assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!"); - EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldEltCount); - Data = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0)); - } - - SDValue NewVec = DAG.getNode( - ISD::BITCAST, dl, - EVT::getVectorVT(*DAG.getContext(), NewVT, OldEltCount * 2), Data); - - auto [DataLo, DataHi] = DAG.SplitVector(NewVec, dl); - auto [PassLo, PassHi] = DAG.SplitScalar(PassThru, dl, NewVT, NewVT); - - EVT SplitVT = DataLo.getValueType(); - - // TODO: I *think* this works correctly, but I haven't confirmed it yet by - // actually running a compiled program with example data. - // - // We want the matching lo and hi parts from whichever lane was the last - // active. 
- SDValue Deinterleaved; - if (SplitVT.isFixedLengthVector()) { - unsigned SplitNum = SplitVT.getVectorMinNumElements(); - SDValue Even = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi, - createStrideMask(0, 2, SplitNum)); - SDValue Odd = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi, - createStrideMask(1, 2, SplitNum)); - Deinterleaved = DAG.getMergeValues({Even, Odd}, dl); - } else - Deinterleaved = - DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl, - DAG.getVTList(SplitVT, SplitVT), DataLo, DataHi); - - Lo = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT, - Deinterleaved.getValue(0), Mask, PassLo); - Hi = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT, - Deinterleaved.getValue(1), Mask, PassHi); - - // FIXME: Endianness? - assert(!DAG.getDataLayout().isBigEndian() && - "Implement big endian result expansion for extract_last_active"); -} - void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(ISD::isNormalLoad(N) && "This routine only for normal loads!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8026e5c845b41..607c70675c988 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -29,7 +29,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -139,7 +138,6 @@ class VectorLegalizer { SDValue ExpandVP_FNEG(SDNode *Node); SDValue ExpandVP_FABS(SDNode *Node); SDValue ExpandVP_FCOPYSIGN(SDNode *Node); - SDValue ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node); SDValue ExpandSELECT(SDNode *Node); std::pair<SDValue, SDValue> ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); @@ -469,7 +467,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: - case ISD::VECTOR_EXTRACT_LAST_ACTIVE: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -506,6 +503,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAXIMUM: case ISD::VECREDUCE_FMINIMUM: + case ISD::VECTOR_FIND_LAST_ACTIVE: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; @@ -1211,8 +1209,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::VECTOR_COMPRESS: Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG)); return; - case ISD::VECTOR_EXTRACT_LAST_ACTIVE: - Results.push_back(ExpandVECTOR_EXTRACT_LAST_ACTIVE(Node)); + case ISD::VECTOR_FIND_LAST_ACTIVE: + Results.push_back(TLI.expandVectorFindLastActive(Node, DAG)); return; case ISD::SCMP: case ISD::UCMP: @@ -1725,80 +1723,6 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); } -SDValue VectorLegalizer::ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node) { - SDLoc DL(Node); - SDValue Data = Node->getOperand(0); - SDValue Mask = Node->getOperand(1); - SDValue PassThru = Node->getOperand(2); - - EVT DataVT = Data.getValueType(); - EVT ScalarVT = PassThru.getValueType(); - EVT BoolVT = Mask.getValueType().getScalarType(); - - // Find a suitable type for a stepvector. - ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value. 
- if (DataVT.isScalableVector()) - VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned EltWidth = TLI.getBitWidthForCttzElements( - ScalarVT.getTypeForEVT(*DAG.getContext()), DataVT.getVectorElementCount(), - /*ZeroIsPoison=*/true, &VScaleRange); - - // HACK: If the target selects a VT that's too wide based on the legal types - // for a vecreduce_umax, it will force expansion of the node -- which - // doesn't work on scalable vectors... - // Is there another method we could use to get a smaller VT instead - // of just capping to 32b? - EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u)); - EVT StepVecVT = DataVT.changeVectorElementType(StepVT); - - // HACK: If the target selects a VT that's too small to form a legal vector - // type, we also run into problems trying to expand the vecreduce_umax. - // - // I think perhaps we need to revisit how getBitWidthForCttzElements - // works... - if (TLI.getTypeAction(StepVecVT.getSimpleVT()) == - TargetLowering::TypePromoteInteger) { - StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT); - StepVT = StepVecVT.getVectorElementType(); - } - - // Zero out lanes with inactive elements, then find the highest remaining - // value from the stepvector. - SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT); - SDValue StepVec = DAG.getStepVector(DL, StepVecVT); - SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes); - - // HACK: Unfortunately, LegalizeVectorOps does not recursively legalize *all* - // added nodes, just the end result nodes until it finds legal ops. - // LegalizeDAG doesn't handle VSELECT at all presently. So if we need to - // legalize a vselect then we have to do it here. - // - // We might want to change LegalizeVectorOps to walk backwards through the - // nodes like LegalizeDAG? And share VSELECT legalization code with - // LegalizeDAG? - // - // Or would that cause problems with illegal types that we might have just - // introduced? - // - // Having a legal op with illegal types marked as Legal should work, with the - // expectation being that type legalization fixes it up later. - if (TLI.getOperationAction(ISD::VSELECT, StepVecVT) == TargetLowering::Expand) - ActiveElts = LegalizeOp(ActiveElts); - - SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts); - - // Extract the corresponding lane from the data vector - EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout()); - SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, DL, ExtVT); - SDValue Extract = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Data, Idx); - - // If all mask lanes were inactive, choose the passthru value instead. - SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, DL, BoolVT, Mask); - return DAG.getSelect(DL, ScalarVT, AnyActive, Extract, PassThru); -} - void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl<SDValue> &Results) { // Attempt to expand using TargetLowering. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d1c644b064718..abcc75c5a26ab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6426,17 +6426,25 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
                                                        unsigned Intrinsic) {
   assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
          "Tried lowering invalid vector extract last");
   SDLoc sdl = getCurSDLoc();
+  const DataLayout &Layout = DAG.getDataLayout();
   SDValue Data = getValue(I.getOperand(0));
   SDValue Mask = getValue(I.getOperand(1));
-  SDValue PassThru = getValue(I.getOperand(2));
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+  EVT ResVT = TLI.getValueType(Layout, I.getType());
 
-  SDValue Result = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, sdl, ResultVT,
-                               Data, Mask, PassThru);
+  EVT ExtVT = TLI.getVectorIdxTy(Layout);
+  SDValue Idx = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, sdl, ExtVT, Mask);
+  SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ResVT, Data, Idx);
+
+  Value *Default = I.getOperand(2);
+  if (!isa<PoisonValue>(Default) && !isa<UndefValue>(Default)) {
+    SDValue PassThru = getValue(Default);
+    EVT BoolVT = Mask.getValueType().getScalarType();
+    SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+    Result = DAG.getSelect(sdl, ResVT, AnyActive, Result, PassThru);
+  }
   setValue(&I, Result);
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 42cbb721703d9..f63c8dd3df1c8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -567,8 +567,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
 
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return "histogram";
-  case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
-    return "extract_last_active";
+  case ISD::VECTOR_FIND_LAST_ACTIVE:
+    return "find_last_active";
 
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9f57884eae04d..809948c8178c7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12,6 +12,7 @@
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/CodeGenCommonISel.h"
@@ -9453,6 +9454,51 @@ SDValue TargetLowering::expandVPCTTZElements(SDNode *N,
   return DAG.getNode(ISD::VP_REDUCE_UMIN, DL, ResVT, ExtEVL, Select, Mask, EVL);
 }
 
+SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(N);
+  SDValue Mask = N->getOperand(0);
+  EVT MaskVT = Mask.getValueType();
+  EVT BoolVT = MaskVT.getScalarType();
+
+  // Find a suitable type for a stepvector.
+  ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+  if (MaskVT.isScalableVector())
+    VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned EltWidth = TLI.getBitWidthForCttzElements(
+      BoolVT.getTypeForEVT(*DAG.getContext()), MaskVT.getVectorElementCount(),
+      /*ZeroIsPoison=*/true, &VScaleRange);
+
+  // FIXME: If the target selects a VT that's too wide based on the legal types
+  //        for a vecreduce_umax, it will force expansion of the node -- which
+  //        doesn't work on scalable vectors...
+  //        Is there another method we could use to get a smaller VT instead
+  //        of just capping to 32b?
+  EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u));
+  EVT StepVecVT = MaskVT.changeVectorElementType(StepVT);
+
+  // FIXME: If the target selects a VT that's too small to form a legal vector
+  //        type, we also run into problems if expanding after type
+  //        legalization.
+  //
+  //        I think perhaps we need to revisit how getBitWidthForCttzElements
+  //        works...
+  if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
+      TargetLowering::TypePromoteInteger) {
+    StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
+    StepVT = StepVecVT.getVectorElementType();
+  }
+
+  // Zero out lanes with inactive elements, then find the highest remaining
+  // value from the stepvector.
+  SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
+  SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
+  SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
+  SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
+  return DAG.getZExtOrTrunc(HighestIdx, DL, N->getValueType(0));
+}
+
 SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
                                   bool IsNegative) const {
   SDLoc dl(N);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index cc822ad5ec50e..73af0a9a71407 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -819,7 +819,7 @@ void TargetLoweringBase::initActions() {
 #include "llvm/IR/VPIntrinsics.def"
 
     // Masked vector extracts default to expand.
-    setOperationAction(ISD::VECTOR_EXTRACT_LAST_ACTIVE, VT, Expand);
+    setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand);
 
     // FP environment operations default to expand.
     setOperationAction(ISD::GET_FPENV, VT, Expand);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 14fd7851bfa10..3ad2905ce5207 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -401,16 +401,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
   }
 
-  // TODO: Should we include any other operations here? The calls to
-  //       addDRType/addQRType below do mark VSELECT as Expand for the
-  //       specified VTs, but leave other illegal types as the default
-  //       of 'Legal'. LegalizeDAG doesn't legalize VSELECT after type
-  //       legalization if LegalizeVectorOps introduces one.
- for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) - setOperationAction(ISD::VSELECT, VT, Expand); - for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) - setOperationAction(ISD::VSELECT, VT, Expand); - if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll index a0e9c6607042f..3b11e67d072e7 100644 --- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll +++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll @@ -7,26 +7,21 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) { ; NEON-FIXED: // %bb.0: ; NEON-FIXED-NEXT: sub sp, sp, #16 ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 -; NEON-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b -; NEON-FIXED-NEXT: cmeq v1.16b, v1.16b, #0 +; NEON-FIXED-NEXT: cmeq v2.16b, v1.16b, #0 ; NEON-FIXED-NEXT: adrp x8, .LCPI0_0 -; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI0_0] -; NEON-FIXED-NEXT: mov x11, sp +; NEON-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b +; NEON-FIXED-NEXT: ldr q3, [x8, :lo12:.LCPI0_0] +; NEON-FIXED-NEXT: mov x9, sp ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b -; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; NEON-FIXED-NEXT: bic v2.16b, v3.16b, v2.16b ; NEON-FIXED-NEXT: umaxv b1, v1.16b -; NEON-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b -; NEON-FIXED-NEXT: fmov w10, s1 -; NEON-FIXED-NEXT: fmov x8, d2 -; NEON-FIXED-NEXT: bfxil x11, x10, #0, #4 -; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32 -; NEON-FIXED-NEXT: lsr x9, x8, #16 -; NEON-FIXED-NEXT: orr w8, w8, w9 -; NEON-FIXED-NEXT: ldrb w9, [x11] -; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8 -; NEON-FIXED-NEXT: tst w8, #0xff -; NEON-FIXED-NEXT: csel w0, w9, w0, ne +; NEON-FIXED-NEXT: umaxv b2, v2.16b +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4 +; NEON-FIXED-NEXT: ldrb w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -34,25 +29,20 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) { ; SVE-FIXED: // %bb.0: ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 -; SVE-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b -; SVE-FIXED-NEXT: index z4.b, #0, #1 -; SVE-FIXED-NEXT: cmeq v1.16b, v1.16b, #0 -; SVE-FIXED-NEXT: mov x11, sp +; SVE-FIXED-NEXT: index z2.b, #0, #1 +; SVE-FIXED-NEXT: cmeq v3.16b, v1.16b, #0 +; SVE-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b +; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; SVE-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b +; SVE-FIXED-NEXT: bic v2.16b, v2.16b, v3.16b ; SVE-FIXED-NEXT: umaxv b1, v1.16b -; SVE-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b -; SVE-FIXED-NEXT: fmov x8, d2 -; SVE-FIXED-NEXT: fmov w10, s1 -; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32 -; SVE-FIXED-NEXT: bfxil x11, x10, #0, #4 -; SVE-FIXED-NEXT: lsr x9, x8, #16 -; SVE-FIXED-NEXT: orr w8, w8, w9 -; SVE-FIXED-NEXT: ldrb w9, [x11] -; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8 -; SVE-FIXED-NEXT: tst w8, #0xff -; SVE-FIXED-NEXT: csel w0, w9, w0, ne +; SVE-FIXED-NEXT: umaxv b2, v2.16b +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4 +; SVE-FIXED-NEXT: ldrb w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne ; SVE-FIXED-NEXT: 
add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <16 x i8> %mask, zeroinitializer @@ -67,22 +57,19 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h ; NEON-FIXED-NEXT: adrp x8, .LCPI1_0 -; NEON-FIXED-NEXT: mov x11, sp -; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0] +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI1_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.8b, v1.8h -; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; NEON-FIXED-NEXT: fmov x8, d1 -; NEON-FIXED-NEXT: umaxv b1, v2.8b -; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32 -; NEON-FIXED-NEXT: lsr x9, x8, #16 -; NEON-FIXED-NEXT: fmov w10, s1 -; NEON-FIXED-NEXT: orr w8, w8, w9 -; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8 -; NEON-FIXED-NEXT: bfi x11, x10, #1, #3 -; NEON-FIXED-NEXT: tst w8, #0xff -; NEON-FIXED-NEXT: ldrh w9, [x11] -; NEON-FIXED-NEXT: csel w0, w9, w0, ne +; NEON-FIXED-NEXT: xtn v2.8b, v1.8h +; NEON-FIXED-NEXT: umaxv h1, v1.8h +; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b +; NEON-FIXED-NEXT: umaxv b2, v2.8b +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfi x9, x8, #1, #3 +; NEON-FIXED-NEXT: ldrh w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -91,22 +78,19 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h -; SVE-FIXED-NEXT: index z2.b, #0, #1 -; SVE-FIXED-NEXT: mov x11, sp +; SVE-FIXED-NEXT: index z3.b, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.8b, v1.8h -; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; SVE-FIXED-NEXT: fmov x8, d1 -; SVE-FIXED-NEXT: umaxv b1, v2.8b -; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32 -; SVE-FIXED-NEXT: lsr x9, x8, #16 -; SVE-FIXED-NEXT: fmov w10, s1 -; SVE-FIXED-NEXT: orr w8, w8, w9 -; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8 -; SVE-FIXED-NEXT: bfi x11, x10, #1, #3 -; SVE-FIXED-NEXT: tst w8, #0xff -; SVE-FIXED-NEXT: ldrh w9, [x11] -; SVE-FIXED-NEXT: csel w0, w9, w0, ne +; SVE-FIXED-NEXT: xtn v2.8b, v1.8h +; SVE-FIXED-NEXT: umaxv h1, v1.8h +; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b +; SVE-FIXED-NEXT: umaxv b2, v2.8b +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfi x9, x8, #1, #3 +; SVE-FIXED-NEXT: ldrh w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <8 x i16> %mask, zeroinitializer @@ -121,21 +105,19 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s ; NEON-FIXED-NEXT: adrp x8, .LCPI2_0 -; NEON-FIXED-NEXT: mov x11, sp -; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0] +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI2_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.4h, v1.4s -; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; NEON-FIXED-NEXT: fmov x8, d1 +; NEON-FIXED-NEXT: xtn v2.4h, v1.4s +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; NEON-FIXED-NEXT: umaxv h2, v2.4h -; NEON-FIXED-NEXT: lsr x9, x8, #32 -; NEON-FIXED-NEXT: orr w8, w8, w9 -; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16 -; 
NEON-FIXED-NEXT: fmov w10, s2 -; NEON-FIXED-NEXT: tst w8, #0xffff -; NEON-FIXED-NEXT: bfi x11, x10, #2, #2 -; NEON-FIXED-NEXT: ldr w9, [x11] -; NEON-FIXED-NEXT: csel w0, w9, w0, ne +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 +; NEON-FIXED-NEXT: ldr w8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel w0, w8, w0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -144,21 +126,19 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s -; SVE-FIXED-NEXT: index z2.h, #0, #1 -; SVE-FIXED-NEXT: mov x11, sp +; SVE-FIXED-NEXT: index z3.h, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.4h, v1.4s -; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; SVE-FIXED-NEXT: fmov x8, d1 +; SVE-FIXED-NEXT: xtn v2.4h, v1.4s +; SVE-FIXED-NEXT: umaxv s1, v1.4s +; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; SVE-FIXED-NEXT: umaxv h2, v2.4h -; SVE-FIXED-NEXT: lsr x9, x8, #32 -; SVE-FIXED-NEXT: orr w8, w8, w9 -; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16 -; SVE-FIXED-NEXT: fmov w10, s2 -; SVE-FIXED-NEXT: tst w8, #0xffff -; SVE-FIXED-NEXT: bfi x11, x10, #2, #2 -; SVE-FIXED-NEXT: ldr w9, [x11] -; SVE-FIXED-NEXT: csel w0, w9, w0, ne +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 +; SVE-FIXED-NEXT: ldr w8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel w0, w8, w0, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <4 x i32> %mask, zeroinitializer @@ -173,20 +153,19 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d ; NEON-FIXED-NEXT: adrp x8, .LCPI3_0 -; NEON-FIXED-NEXT: mov x10, sp -; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0] +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI3_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.2s, v1.2d -; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; NEON-FIXED-NEXT: fmov x8, d1 +; NEON-FIXED-NEXT: xtn v2.2s, v1.2d +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s -; NEON-FIXED-NEXT: fmov w9, s2 -; NEON-FIXED-NEXT: bfi x10, x9, #3, #1 -; NEON-FIXED-NEXT: lsr x9, x8, #32 -; NEON-FIXED-NEXT: ldr x10, [x10] -; NEON-FIXED-NEXT: orr w8, w8, w9 -; NEON-FIXED-NEXT: cmp w8, #0 -; NEON-FIXED-NEXT: csel x0, x10, x0, ne +; NEON-FIXED-NEXT: fmov w8, s2 +; NEON-FIXED-NEXT: bfi x9, x8, #3, #1 +; NEON-FIXED-NEXT: ldr x8, [x9] +; NEON-FIXED-NEXT: fmov w9, s1 +; NEON-FIXED-NEXT: tst w9, #0x1 +; NEON-FIXED-NEXT: csel x0, x8, x0, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret ; @@ -195,20 +174,19 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d -; SVE-FIXED-NEXT: index z2.s, #0, #1 -; SVE-FIXED-NEXT: mov x10, sp +; SVE-FIXED-NEXT: index z3.s, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.2s, v1.2d -; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b -; SVE-FIXED-NEXT: fmov x8, d1 +; SVE-FIXED-NEXT: xtn v2.2s, v1.2d +; SVE-FIXED-NEXT: umaxv s1, v1.4s +; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b ; SVE-FIXED-NEXT: umaxp 
v2.2s, v2.2s, v2.2s -; SVE-FIXED-NEXT: fmov w9, s2 -; SVE-FIXED-NEXT: bfi x10, x9, #3, #1 -; SVE-FIXED-NEXT: lsr x9, x8, #32 -; SVE-FIXED-NEXT: ldr x10, [x10] -; SVE-FIXED-NEXT: orr w8, w8, w9 -; SVE-FIXED-NEXT: cmp w8, #0 -; SVE-FIXED-NEXT: csel x0, x10, x0, ne +; SVE-FIXED-NEXT: fmov w8, s2 +; SVE-FIXED-NEXT: bfi x9, x8, #3, #1 +; SVE-FIXED-NEXT: ldr x8, [x9] +; SVE-FIXED-NEXT: fmov w9, s1 +; SVE-FIXED-NEXT: tst w9, #0x1 +; SVE-FIXED-NEXT: csel x0, x8, x0, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret %notzero = icmp ne <2 x i64> %mask, zeroinitializer @@ -223,20 +201,18 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s ; NEON-FIXED-NEXT: adrp x8, .LCPI4_0 -; NEON-FIXED-NEXT: mov x11, sp -; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0] +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI4_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.4h, v1.4s -; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b -; NEON-FIXED-NEXT: fmov x8, d1 +; NEON-FIXED-NEXT: xtn v3.4h, v1.4s +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b ; NEON-FIXED-NEXT: umaxv h3, v3.4h -; NEON-FIXED-NEXT: lsr x9, x8, #32 -; NEON-FIXED-NEXT: orr w8, w8, w9 -; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16 -; NEON-FIXED-NEXT: fmov w10, s3 -; NEON-FIXED-NEXT: tst w8, #0xffff -; NEON-FIXED-NEXT: bfi x11, x10, #2, #2 -; NEON-FIXED-NEXT: ldr s0, [x11] +; NEON-FIXED-NEXT: fmov w8, s3 +; NEON-FIXED-NEXT: bfi x9, x8, #2, #2 +; NEON-FIXED-NEXT: fmov w8, s1 +; NEON-FIXED-NEXT: ldr s0, [x9] +; NEON-FIXED-NEXT: tst w8, #0x1 ; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne ; NEON-FIXED-NEXT: add sp, sp, #16 ; NEON-FIXED-NEXT: ret @@ -246,20 +222,18 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass ; SVE-FIXED-NEXT: sub sp, sp, #16 ; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16 ; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s -; SVE-FIXED-NEXT: index z3.h, #0, #1 -; SVE-FIXED-NEXT: mov x11, sp +; SVE-FIXED-NEXT: index z4.h, #0, #1 +; SVE-FIXED-NEXT: mov x9, sp ; SVE-FIXED-NEXT: str q0, [sp] -; SVE-FIXED-NEXT: xtn v1.4h, v1.4s -; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b -; SVE-FIXED-NEXT: fmov x8, d1 +; SVE-FIXED-NEXT: xtn v3.4h, v1.4s +; SVE-FIXED-NEXT: umaxv s1, v1.4s +; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b ; SVE-FIXED-NEXT: umaxv h3, v3.4h -; SVE-FIXED-NEXT: lsr x9, x8, #32 -; SVE-FIXED-NEXT: orr w8, w8, w9 -; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16 -; SVE-FIXED-NEXT: fmov w10, s3 -; SVE-FIXED-NEXT: tst w8, #0xffff -; SVE-FIXED-NEXT: bfi x11, x10, #2, #2 -; SVE-FIXED-NEXT: ldr s0, [x11] +; SVE-FIXED-NEXT: fmov w8, s3 +; SVE-FIXED-NEXT: bfi x9, x8, #2, #2 +; SVE-FIXED-NEXT: fmov w8, s1 +; SVE-FIXED-NEXT: ldr s0, [x9] +; SVE-FIXED-NEXT: tst w8, #0x1 ; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne ; SVE-FIXED-NEXT: add sp, sp, #16 ; SVE-FIXED-NEXT: ret @@ -275,19 +249,18 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double % ; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16 ; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d ; NEON-FIXED-NEXT: adrp x8, .LCPI5_0 -; NEON-FIXED-NEXT: mov x10, sp -; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0] +; NEON-FIXED-NEXT: mov x9, sp +; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI5_0] ; NEON-FIXED-NEXT: str q0, [sp] -; NEON-FIXED-NEXT: xtn v1.2s, v1.2d -; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b -; NEON-FIXED-NEXT: fmov x8, d1 +; NEON-FIXED-NEXT: xtn v3.2s, v1.2d +; NEON-FIXED-NEXT: umaxv s1, v1.4s +; 
NEON-FIXED-NEXT:    and v3.8b, v3.8b, v4.8b
 ; NEON-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
-; NEON-FIXED-NEXT:    fmov w9, s3
-; NEON-FIXED-NEXT:    bfi x10, x9, #3, #1
-; NEON-FIXED-NEXT:    lsr x9, x8, #32
-; NEON-FIXED-NEXT:    ldr d0, [x10]
-; NEON-FIXED-NEXT:    orr w8, w8, w9
-; NEON-FIXED-NEXT:    cmp w8, #0
+; NEON-FIXED-NEXT:    fmov w8, s3
+; NEON-FIXED-NEXT:    bfi x9, x8, #3, #1
+; NEON-FIXED-NEXT:    fmov w8, s1
+; NEON-FIXED-NEXT:    ldr d0, [x9]
+; NEON-FIXED-NEXT:    tst w8, #0x1
 ; NEON-FIXED-NEXT:    fcsel d0, d0, d2, ne
 ; NEON-FIXED-NEXT:    add sp, sp, #16
 ; NEON-FIXED-NEXT:    ret
@@ -297,19 +270,18 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
 ; SVE-FIXED:       // %bb.0:
 ; SVE-FIXED-NEXT:    sub sp, sp, #16
 ; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
 ; SVE-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
-; SVE-FIXED-NEXT:    index z3.s, #0, #1
-; SVE-FIXED-NEXT:    mov x10, sp
+; SVE-FIXED-NEXT:    index z4.s, #0, #1
+; SVE-FIXED-NEXT:    mov x9, sp
 ; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    xtn v1.2s, v1.2d
-; SVE-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT:    fmov x8, d1
+; SVE-FIXED-NEXT:    xtn v3.2s, v1.2d
+; SVE-FIXED-NEXT:    umaxv s1, v1.4s
+; SVE-FIXED-NEXT:    and v3.8b, v3.8b, v4.8b
 ; SVE-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
-; SVE-FIXED-NEXT:    fmov w9, s3
-; SVE-FIXED-NEXT:    bfi x10, x9, #3, #1
-; SVE-FIXED-NEXT:    lsr x9, x8, #32
-; SVE-FIXED-NEXT:    ldr d0, [x10]
-; SVE-FIXED-NEXT:    orr w8, w8, w9
-; SVE-FIXED-NEXT:    cmp w8, #0
+; SVE-FIXED-NEXT:    fmov w8, s3
+; SVE-FIXED-NEXT:    bfi x9, x8, #3, #1
+; SVE-FIXED-NEXT:    fmov w8, s1
+; SVE-FIXED-NEXT:    ldr d0, [x9]
+; SVE-FIXED-NEXT:    tst w8, #0x1
 ; SVE-FIXED-NEXT:    fcsel d0, d0, d2, ne
 ; SVE-FIXED-NEXT:    add sp, sp, #16
 ; SVE-FIXED-NEXT:    ret
@@ -430,6 +402,24 @@ define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) {
   ret double %res
 }
 
+define i8 @extract_last_i8_scalable_poison_passthru(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: extract_last_i8_scalable_poison_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    mov z2.b, #0 // =0x0
+; CHECK-NEXT:    sel z1.b, p0, z1.b, z2.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    umaxv b1, p0, z1.b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    and x8, x8, #0xff
+; CHECK-NEXT:    whilels p0.b, xzr, x8
+; CHECK-NEXT:    lastb w0, p0, z0.b
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 poison)
+  ret i8 %res
+}
+
 declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
 declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
 declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
index 81ff400b38cb4..10929394af75f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
@@ -76,31 +76,23 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
 ; RV32-LABEL: extract_last_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmsne.vi v9, v9, 0
-; RV32-NEXT:    vmv.v.i v0, 1
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 0
-; RV32-NEXT:    vmv1r.v v11, v10
-; RV32-NEXT:    vcpop.m a2, v9
-; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
-; RV32-NEXT:    vrgather.vi v11, v8, 1, v0.t
-; RV32-NEXT:    vmv1r.v v0, v9
+; RV32-NEXT:    vmsne.vi v0, v9, 0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
-; RV32-NEXT:    vid.v v12, v0.t
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vcpop.m a2, v0
+; RV32-NEXT:    vid.v v9, v0.t
 ; RV32-NEXT:    beqz a2, .LBB3_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    vredmaxu.vs v9, v12, v12
-; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 1
-; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-NEXT:    vredmaxu.vs v9, v9, v9
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmv.x.s a0, v9
 ; RV32-NEXT:    andi a0, a0, 255
-; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vx v9, v11, a0
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vx v8, v8, a0
-; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:  .LBB3_2:
 ; RV32-NEXT:    ret
 ;
@@ -176,115 +168,64 @@
 }
 
 define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) {
-; RV32-LABEL: extract_last_i8_scalable:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, mu
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vcpop.m a1, v0
-; RV32-NEXT:    vid.v v16, v0.t
-; RV32-NEXT:    beqz a1, .LBB6_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    vredmaxu.vs v10, v16, v16
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RV32-NEXT:    vslidedown.vx v8, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:  .LBB6_2:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: extract_last_i8_scalable:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, mu
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vcpop.m a1, v0
-; RV64-NEXT:    vid.v v16, v0.t
-; RV64-NEXT:    beqz a1, .LBB6_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    vredmaxu.vs v10, v16, v16
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a0, a0, 32
-; RV64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:  .LBB6_2:
-; RV64-NEXT:    ret
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    beqz a1, .LBB6_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:  .LBB6_2:
+; CHECK-NEXT:    ret
   %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
   ret i8 %res
 }
 
 define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) {
-; RV32-LABEL: extract_last_i16_scalable:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT:    vmv.v.i v12, 0
-; RV32-NEXT:    vcpop.m a1, v0
-; RV32-NEXT:    vid.v v12, v0.t
-; RV32-NEXT:    beqz a1, .LBB7_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    vredmaxu.vs v10, v12, v12
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vslidedown.vx v8, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:  .LBB7_2:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: extract_last_i16_scalable:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m4, ta, mu
-; RV64-NEXT:    vmv.v.i v12, 0
-; RV64-NEXT:    vcpop.m a1, v0
-; RV64-NEXT:    vid.v v12, v0.t
-; RV64-NEXT:    beqz a1, .LBB7_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    vredmaxu.vs v10, v12, v12
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a0, a0, 32
-; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:  .LBB7_2:
-; RV64-NEXT:    ret
+; CHECK-LABEL: extract_last_i16_scalable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    beqz a1, .LBB7_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:  .LBB7_2:
+; CHECK-NEXT:    ret
   %res = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
   ret i16 %res
 }
 
 define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) {
-; RV32-LABEL: extract_last_i32_scalable:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, mu
-; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    vcpop.m a1, v0
-; RV32-NEXT:    vid.v v10, v0.t
-; RV32-NEXT:    beqz a1, .LBB8_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    vredmaxu.vs v10, v10, v10
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vx v8, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:  .LBB8_2:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: extract_last_i32_scalable:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, mu
-; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    vcpop.m a1, v0
-; RV64-NEXT:    vid.v v10, v0.t
-; RV64-NEXT:    beqz a1, .LBB8_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    vredmaxu.vs v10, v10, v10
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a0, a0, 32
-; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v8, a0
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:  .LBB8_2:
-; RV64-NEXT:    ret
+; CHECK-LABEL: extract_last_i32_scalable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    beqz a1, .LBB8_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:  .LBB8_2:
+; CHECK-NEXT:    ret
   %res = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
   ret i32 %res
 }
@@ -292,27 +233,28 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) {
 
 define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
 ; RV32-LABEL: extract_last_i64_scalable:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, mu
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    vcpop.m a2, v0
 ; RV32-NEXT:    vid.v v10, v0.t
 ; RV32-NEXT:    beqz a2, .LBB9_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vredmaxu.vs v10, v10, v10
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vnsrl.wi v10, v8, 0
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vnsrl.wx v11, v8, a1
-; RV32-NEXT:    vslidedown.vx v8, v10, a0
-; RV32-NEXT:    vslidedown.vx v9, v11, a0
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    andi a0, a0, 255
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:  .LBB9_2:
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: extract_last_i64_scalable:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e32, m1, ta, mu
+; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, mu
 ; RV64-NEXT:    vmv.v.i v10, 0
 ; RV64-NEXT:    vcpop.m a1, v0
 ; RV64-NEXT:    vid.v v10, v0.t
@@ -320,8 +262,7 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vredmaxu.vs v10, v10, v10
 ; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    andi a0, a0, 255
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vx v8, v8, a0
@@ -339,61 +280,35 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
 }
 
 define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) {
-; RV32-LABEL: extract_last_float_scalable:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
-; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    vcpop.m a0, v0
-; RV32-NEXT:    vid.v v10, v0.t
-; RV32-NEXT:    beqz a0, .LBB10_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    vredmaxu.vs v10, v10, v10
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vx v8, v8, a0
-; RV32-NEXT:    vfmv.f.s fa0, v8
-; RV32-NEXT:  .LBB10_2:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: extract_last_float_scalable:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
-; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    vcpop.m a0, v0
-; RV64-NEXT:    vid.v v10, v0.t
-; RV64-NEXT:    beqz a0, .LBB10_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    vredmaxu.vs v10, v10, v10
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a0, a0, 32
-; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v8, a0
-; RV64-NEXT:    vfmv.f.s fa0, v8
-; RV64-NEXT:  .LBB10_2:
-; RV64-NEXT:    ret
+; CHECK-LABEL: extract_last_float_scalable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vcpop.m a0, v0
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    beqz a0, .LBB10_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB10_2:
+; CHECK-NEXT:    ret
   %res = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru)
   ret float %res
 }
 
 define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) {
-; RV32-LABEL: extract_last_double_scalable:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    vcpop.m a0, v0
-; RV32-NEXT:    vid.v v10, v0.t
-; RV32-NEXT:    beqz a0, .LBB11_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    vredmaxu.vs v10, v10, v10
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vx v8, v8, a0
-; RV32-NEXT:    vfmv.f.s fa0, v8
-; RV32-NEXT:  .LBB11_2:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: extract_last_double_scalable:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    vcpop.m a0, v0
-; RV64-NEXT:    vid.v v10, v0.t
-; RV64-NEXT:    beqz a0, .LBB11_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    vredmaxu.vs v10, v10, v10
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a0, a0, 32
-; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v8, a0
-; RV64-NEXT:    vfmv.f.s fa0, v8
-; RV64-NEXT:  .LBB11_2:
-; RV64-NEXT:    ret
+; CHECK-LABEL: extract_last_double_scalable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vcpop.m a0, v0
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    beqz a0, .LBB11_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB11_2:
+; CHECK-NEXT:    ret
   %res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
   ret double %res
 }

From 52e55c0983d96616ea42e8c70ad54bbcd9a07b2c Mon Sep 17 00:00:00 2001
From: Graham Hunter
Date: Mon, 13 Jan 2025 15:55:56 +0000
Subject: [PATCH 3/3] Remove hardcoded 32b upper bound, clarify type promotion

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 20 ++++++-------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 809948c8178c7..90ac79cfb0e3b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9462,28 +9462,20 @@ SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
   EVT BoolVT = MaskVT.getScalarType();
 
   // Find a suitable type for a stepvector.
-  ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+  ConstantRange VScaleRange(1, /*isFullSet=*/true); // Fixed length default.
   if (MaskVT.isScalableVector())
     VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   unsigned EltWidth = TLI.getBitWidthForCttzElements(
       BoolVT.getTypeForEVT(*DAG.getContext()), MaskVT.getVectorElementCount(),
       /*ZeroIsPoison=*/true, &VScaleRange);
-
-  // FIXME: If the target selects a VT that's too wide based on the legal types
-  //        for a vecreduce_umax, it will force expansion of the node -- which
-  //        doesn't work on scalable vectors...
-  //        Is there another method we could use to get a smaller VT instead
-  //        of just capping to 32b?
-  EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u));
+  EVT StepVT = MVT::getIntegerVT(EltWidth);
   EVT StepVecVT = MaskVT.changeVectorElementType(StepVT);
 
-  // FIXME: If the target selects a VT that's too small to form a legal vector
-  //        type, we also run into problems if expanding after type
-  //        legalization.
-  //
-  //        I think perhaps we need to revisit how getBitWidthForCttzElements
-  //        works...
+  // If promotion is required to make the type legal, do it here; promotion of
+  // integers within LegalizeVectorOps looks for types of the same overall
+  // size but with fewer, larger elements, not the usual wider type with the
+  // same number of elements.
   if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
       TargetLowering::TypePromoteInteger) {
     StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
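(A note on the sizing logic above: getBitWidthForCttzElements is expected to return a
width just large enough to hold every lane index, taking the maximum vscale into account
for scalable types. The sketch below shows the kind of calculation involved; it is an
assumption-laden illustration, stepElementBits is a hypothetical helper rather than the
LLVM API, and power-of-two widths are assumed.)

#include <cstdint>

// Pick a power-of-two element width able to hold indices 0..MaxElts-1; for
// scalable vectors the lane count is scaled by the maximum possible vscale.
// Any vector type that is still illegal afterwards is widened by the
// TypePromoteInteger check in the patch above.
unsigned stepElementBits(uint64_t MinNumElts, bool Scalable,
                         uint64_t MaxVScale) {
  uint64_t MaxElts = Scalable ? MinNumElts * MaxVScale : MinNumElts;
  unsigned Bits = 8; // smallest step-element width considered
  while (Bits < 64 && (uint64_t(1) << Bits) < MaxElts)
    Bits *= 2; // 8 -> 16 -> 32 -> 64
  return Bits;
}

For instance, a <vscale x 16 x i8> mask with a maximum vscale of 16 gives at most 256
lanes, so 8 bits suffice for indices 0..255; that is consistent with the
"and x8, x8, #0xff" / "andi a0, a0, 255" masking of the reduced index visible in the
AArch64 and RISC-V tests above.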