diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index f84e83816bf33..2f35bee5c526f 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -155,6 +155,13 @@ def shuf_to_ins: GICombineRule < (apply [{ applyINS(*${root}, MRI, B, ${matchinfo}); }]) >; +def perfect_shuffle: GICombineRule < + (defs root:$root), + (match (G_SHUFFLE_VECTOR $dst, $src1, $src2, $mask):$root, + [{ return matchPerfectShuffle(*${root}, MRI); }]), + (apply [{ applyPerfectShuffle(*${root}, MRI, B); }]) +>; + def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">; def vashr_vlshr_imm : GICombineRule< (defs root:$root, vashr_vlshr_imm_matchdata:$matchinfo), @@ -173,7 +180,8 @@ def form_duplane : GICombineRule < >; def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, fullrev, - form_duplane, shuf_to_ins]>; + form_duplane, shuf_to_ins, + perfect_shuffle]>; // Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's def vector_unmerge_lowering : GICombineRule < diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 771eee1b3fecf..314ca46a2e253 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13507,172 +13507,6 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } -/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit -/// the specified operations to build the shuffle. ID is the perfect-shuffle -//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle -//table entry and LHS/RHS are the immediate inputs for this stage of the -//shuffle. -static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, - SDValue V2, unsigned PFEntry, SDValue LHS, - SDValue RHS, SelectionDAG &DAG, - const SDLoc &dl) { - unsigned OpNum = (PFEntry >> 26) & 0x0F; - unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); - unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); - - enum { - OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR, // VTRN, right result - OP_MOVLANE // Move lane. RHSID is the lane to move into - }; - - if (OpNum == OP_COPY) { - if (LHSID == (1 * 9 + 2) * 9 + 3) - return LHS; - assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); - return RHS; - } - - if (OpNum == OP_MOVLANE) { - // Decompose a PerfectShuffle ID to get the Mask for lane Elt - auto getPFIDLane = [](unsigned ID, int Elt) -> int { - assert(Elt < 4 && "Expected Perfect Lanes to be less than 4"); - Elt = 3 - Elt; - while (Elt > 0) { - ID /= 9; - Elt--; - } - return (ID % 9 == 8) ? -1 : ID % 9; - }; - - // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We - // get the lane to move from the PFID, which is always from the - // original vectors (V1 or V2). - SDValue OpLHS = GeneratePerfectShuffle( - LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); - EVT VT = OpLHS.getValueType(); - assert(RHSID < 8 && "Expected a lane index for RHSID!"); - unsigned ExtLane = 0; - SDValue Input; - - // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. 
D movs - // convert into a higher type. - if (RHSID & 0x4) { - int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1; - if (MaskElt == -1) - MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1; - assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); - ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2); - Input = MaskElt < 2 ? V1 : V2; - if (VT.getScalarSizeInBits() == 16) { - Input = DAG.getBitcast(MVT::v2f32, Input); - OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS); - } else { - assert(VT.getScalarSizeInBits() == 32 && - "Expected 16 or 32 bit shuffle elemements"); - Input = DAG.getBitcast(MVT::v2f64, Input); - OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS); - } - } else { - int MaskElt = getPFIDLane(ID, RHSID); - assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); - ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4); - Input = MaskElt < 4 ? V1 : V2; - // Be careful about creating illegal types. Use f16 instead of i16. - if (VT == MVT::v4i16) { - Input = DAG.getBitcast(MVT::v4f16, Input); - OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS); - } - } - SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - Input.getValueType().getVectorElementType(), - Input, DAG.getVectorIdxConstant(ExtLane, dl)); - SDValue Ins = - DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS, - Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl)); - return DAG.getBitcast(VT, Ins); - } - - SDValue OpLHS, OpRHS; - OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, - RHS, DAG, dl); - OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS, - RHS, DAG, dl); - EVT VT = OpLHS.getValueType(); - - switch (OpNum) { - default: - llvm_unreachable("Unknown shuffle opcode!"); - case OP_VREV: - // VREV divides the vector in half and swaps within the half. 
-    if (VT.getVectorElementType() == MVT::i32 ||
-        VT.getVectorElementType() == MVT::f32)
-      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
-    // vrev <4 x i16> -> REV32
-    if (VT.getVectorElementType() == MVT::i16 ||
-        VT.getVectorElementType() == MVT::f16 ||
-        VT.getVectorElementType() == MVT::bf16)
-      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
-    // vrev <4 x i8> -> REV16
-    assert(VT.getVectorElementType() == MVT::i8);
-    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
-  case OP_VDUP0:
-  case OP_VDUP1:
-  case OP_VDUP2:
-  case OP_VDUP3: {
-    EVT EltTy = VT.getVectorElementType();
-    unsigned Opcode;
-    if (EltTy == MVT::i8)
-      Opcode = AArch64ISD::DUPLANE8;
-    else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
-      Opcode = AArch64ISD::DUPLANE16;
-    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
-      Opcode = AArch64ISD::DUPLANE32;
-    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
-      Opcode = AArch64ISD::DUPLANE64;
-    else
-      llvm_unreachable("Invalid vector element type?");
-
-    if (VT.getSizeInBits() == 64)
-      OpLHS = WidenVector(OpLHS, DAG);
-    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
-    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
-  }
-  case OP_VEXT1:
-  case OP_VEXT2:
-  case OP_VEXT3: {
-    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
-    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
-                       DAG.getConstant(Imm, dl, MVT::i32));
-  }
-  case OP_VUZPL:
-    return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
-  case OP_VUZPR:
-    return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
-  case OP_VZIPL:
-    return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
-  case OP_VZIPR:
-    return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
-  case OP_VTRNL:
-    return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
-  case OP_VTRNR:
-    return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
-  }
-}
-
 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
                            SelectionDAG &DAG) {
   // Check to see if we can use the TBL instruction.
@@ -14096,8 +13930,102 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                             PFIndexes[2] * 9 + PFIndexes[3];
     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
-    return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
-                                  dl);
+
+    auto BuildRev = [&DAG, &dl](SDValue OpLHS) {
+      EVT VT = OpLHS.getValueType();
+      unsigned Opcode = VT.getScalarSizeInBits() == 32   ? AArch64ISD::REV64
+                        : VT.getScalarSizeInBits() == 16 ?
AArch64ISD::REV32 + : AArch64ISD::REV16; + return DAG.getNode(Opcode, dl, VT, OpLHS); + }; + auto BuildDup = [&DAG, &dl](SDValue OpLHS, unsigned Lane) { + EVT VT = OpLHS.getValueType(); + unsigned Opcode; + if (VT.getScalarSizeInBits() == 8) + Opcode = AArch64ISD::DUPLANE8; + else if (VT.getScalarSizeInBits() == 16) + Opcode = AArch64ISD::DUPLANE16; + else if (VT.getScalarSizeInBits() == 32) + Opcode = AArch64ISD::DUPLANE32; + else if (VT.getScalarSizeInBits() == 64) + Opcode = AArch64ISD::DUPLANE64; + else + llvm_unreachable("Invalid vector element type?"); + + if (VT.getSizeInBits() == 64) + OpLHS = WidenVector(OpLHS, DAG); + return DAG.getNode(Opcode, dl, VT, OpLHS, + DAG.getConstant(Lane, dl, MVT::i64)); + }; + auto BuildExt = [&DAG, &dl](SDValue OpLHS, SDValue OpRHS, unsigned Imm) { + EVT VT = OpLHS.getValueType(); + Imm = Imm * getExtFactor(OpLHS); + return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, + DAG.getConstant(Imm, dl, MVT::i32)); + }; + auto BuildZipLike = [&DAG, &dl](unsigned OpNum, SDValue OpLHS, + SDValue OpRHS) { + EVT VT = OpLHS.getValueType(); + unsigned Opc = 0; + switch (OpNum) { + default: + llvm_unreachable("Unexpected perfect shuffle opcode"); + case OP_VUZPL: + Opc = AArch64ISD::UZP1; + break; + case OP_VUZPR: + Opc = AArch64ISD::UZP2; + break; + case OP_VZIPL: + Opc = AArch64ISD::ZIP1; + break; + case OP_VZIPR: + Opc = AArch64ISD::ZIP2; + break; + case OP_VTRNL: + Opc = AArch64ISD::TRN1; + break; + case OP_VTRNR: + Opc = AArch64ISD::TRN2; + } + return DAG.getNode(Opc, dl, VT, OpLHS, OpRHS); + }; + auto BuildExtractInsert64 = [&DAG, &dl](SDValue ExtSrc, unsigned ExtLane, + SDValue InsSrc, unsigned InsLane) { + EVT VT = InsSrc.getValueType(); + if (VT.getScalarSizeInBits() == 16) { + ExtSrc = DAG.getBitcast(MVT::v2f32, ExtSrc); + InsSrc = DAG.getBitcast(MVT::v2f32, InsSrc); + } else if (VT.getScalarSizeInBits() == 32) { + ExtSrc = DAG.getBitcast(MVT::v2f64, ExtSrc); + InsSrc = DAG.getBitcast(MVT::v2f64, InsSrc); + } + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + ExtSrc.getValueType().getVectorElementType(), + ExtSrc, DAG.getConstant(ExtLane, dl, MVT::i64)); + SDValue Ins = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtSrc.getValueType(), InsSrc, + Ext, DAG.getConstant(InsLane, dl, MVT::i64)); + return DAG.getBitcast(VT, Ins); + }; + auto BuildExtractInsert32 = [&DAG, &dl](SDValue ExtSrc, unsigned ExtLane, + SDValue InsSrc, unsigned InsLane) { + EVT VT = InsSrc.getValueType(); + if (VT.getScalarSizeInBits() == 16) { + ExtSrc = DAG.getBitcast(MVT::v4f16, ExtSrc); + InsSrc = DAG.getBitcast(MVT::v4f16, InsSrc); + } + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + ExtSrc.getValueType().getVectorElementType(), + ExtSrc, DAG.getConstant(ExtLane, dl, MVT::i64)); + SDValue Ins = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtSrc.getValueType(), InsSrc, + Ext, DAG.getConstant(InsLane, dl, MVT::i64)); + return DAG.getBitcast(VT, Ins); + }; + return generatePerfectShuffle( + PFTableIndex, V1, V2, PFEntry, V1, V2, BuildExtractInsert64, + BuildExtractInsert32, BuildRev, BuildDup, BuildExt, BuildZipLike); } // Check for a "select shuffle", generating a BSL to pick between lanes in diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index 7b044cf7c238f..ea72db2622f35 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -6588,6 +6588,25 @@ static const unsigned PerfectShuffleTable[6561 + 1] = { 835584U, // : Cost 0 copy LHS 0}; +enum 
{
+  OP_COPY = 0, // Copy, used for things like <u,u,u,u> to say it is <0,1,2,3>
+  OP_VREV,
+  OP_VDUP0,
+  OP_VDUP1,
+  OP_VDUP2,
+  OP_VDUP3,
+  OP_VEXT1,
+  OP_VEXT2,
+  OP_VEXT3,
+  OP_VUZPL, // VUZP, left result
+  OP_VUZPR, // VUZP, right result
+  OP_VZIPL, // VZIP, left result
+  OP_VZIPR, // VZIP, right result
+  OP_VTRNL, // VTRN, left result
+  OP_VTRNR, // VTRN, right result
+  OP_MOVLANE // Move lane. RHSID is the lane to move into
+};
+
 inline unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
   assert(M.size() == 4 && "Expected a 4 entry perfect shuffle");
 
@@ -6723,6 +6742,109 @@ inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
   return true;
 }
 
+/// Generate perfect shuffles, shared between SDAG and GISel.
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle. ID is the perfect-shuffle
+/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect
+/// shuffle table entry and LHS/RHS are the immediate inputs for this stage of
+/// the shuffle. The implementation is shared between SDAG and GISel.
+template <typename Val, typename ExtInsFn64, typename ExtInsFn32,
+          typename ValFn, typename ValImmFn, typename ValValFn,
+          typename ValValFn2>
+inline Val generatePerfectShuffle(unsigned ID, Val V1, Val V2, unsigned PFEntry,
+                                  Val LHS, Val RHS,
+                                  ExtInsFn64 BuildExtractInsert64,
+                                  ExtInsFn32 BuildExtractInsert32,
+                                  ValFn BuildRev, ValImmFn BuildDup,
+                                  ValValFn BuildExt, ValValFn2 BuildZip) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1 * 9 + 2) * 9 + 3)
+      return LHS;
+    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  if (OpNum == OP_MOVLANE) {
+    // Decompose a PerfectShuffle ID to get the Mask for lane Elt
+    auto getPFIDLane = [](unsigned ID, int Elt) -> int {
+      assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
+      Elt = 3 - Elt;
+      while (Elt > 0) {
+        ID /= 9;
+        Elt--;
+      }
+      return (ID % 9 == 8) ? -1 : ID % 9;
+    };
+
+    // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
+    // get the lane to move from the PFID, which is always from the
+    // original vectors (V1 or V2).
+    Val OpLHS = generatePerfectShuffle(
+        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS,
+        BuildExtractInsert64, BuildExtractInsert32, BuildRev, BuildDup,
+        BuildExt, BuildZip);
+    assert(RHSID < 8 && "Expected a lane index for RHSID!");
+    unsigned ExtLane = 0;
+    Val Input;
+
+    // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+    // convert into a higher type.
+    if (RHSID & 0x4) {
+      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+      if (MaskElt == -1)
+        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
+      Input = MaskElt < 2 ? V1 : V2;
+      return BuildExtractInsert64(Input, ExtLane, OpLHS, RHSID & 0x3);
+    }
+    int MaskElt = getPFIDLane(ID, RHSID);
+    assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+    ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+    Input = MaskElt < 4 ?
V1 : V2;
+    return BuildExtractInsert32(Input, ExtLane, OpLHS, RHSID & 0x3);
+  }
+
+  Val OpLHS, OpRHS;
+  OpLHS = generatePerfectShuffle(
+      LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, BuildExtractInsert64,
+      BuildExtractInsert32, BuildRev, BuildDup, BuildExt, BuildZip);
+  OpRHS = generatePerfectShuffle(
+      RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS, RHS, BuildExtractInsert64,
+      BuildExtractInsert32, BuildRev, BuildDup, BuildExt, BuildZip);
+
+  switch (OpNum) {
+  default:
+    llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    // VREV divides the vector in half and swaps within the half.
+    return BuildRev(OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3:
+    return BuildDup(OpLHS, OpNum - OP_VDUP0);
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3: {
+    unsigned Imm = OpNum - OP_VEXT1 + 1;
+    return BuildExt(OpLHS, OpRHS, Imm);
+  }
+  case OP_VUZPL:
+  case OP_VUZPR:
+  case OP_VZIPL:
+  case OP_VZIPR:
+  case OP_VTRNL:
+  case OP_VTRNR:
+    return BuildZip(OpNum, OpLHS, OpRHS);
+  }
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4785c7b68d94d..86d7c06e5e2f0 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -524,6 +524,140 @@ void applyINS(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+/// Match a 4-element G_SHUFFLE_VECTOR.
+bool matchPerfectShuffle(MachineInstr &MI, MachineRegisterInfo &MRI) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+  return MRI.getType(MI.getOperand(0).getReg()).getNumElements() == 4;
+}
+
+void applyPerfectShuffle(MachineInstr &MI, MachineRegisterInfo &MRI,
+                         MachineIRBuilder &MIB) {
+  Register Dst = MI.getOperand(0).getReg();
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+  assert(ShuffleMask.size() == 4 && "Expected 4 element mask");
+
+  unsigned PFIndexes[4];
+  for (unsigned i = 0; i != 4; ++i) {
+    if (ShuffleMask[i] < 0)
+      PFIndexes[i] = 8;
+    else
+      PFIndexes[i] = ShuffleMask[i];
+  }
+
+  // Compute the index in the perfect shuffle table.
+  unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                          PFIndexes[2] * 9 + PFIndexes[3];
+  unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+
+  auto BuildRev = [&MIB, &MRI](Register OpLHS) {
+    LLT Ty = MRI.getType(OpLHS);
+    unsigned Opcode = Ty.getScalarSizeInBits() == 32   ? AArch64::G_REV64
+                      : Ty.getScalarSizeInBits() == 16 ?
AArch64::G_REV32 + : AArch64::G_REV16; + return MIB.buildInstr(Opcode, {Ty}, {OpLHS}).getReg(0); + }; + auto BuildDup = [&MIB, &MRI](Register OpLHS, unsigned Lane) { + LLT Ty = MRI.getType(OpLHS); + unsigned Opcode; + if (Ty.getScalarSizeInBits() == 8) + Opcode = AArch64::G_DUPLANE8; + else if (Ty.getScalarSizeInBits() == 16) + Opcode = AArch64::G_DUPLANE16; + else if (Ty.getScalarSizeInBits() == 32) + Opcode = AArch64::G_DUPLANE32; + else if (Ty.getScalarSizeInBits() == 64) + Opcode = AArch64::G_DUPLANE64; + else + llvm_unreachable("Invalid vector element type?"); + + if (Ty.getSizeInBits() == 64) + OpLHS = MIB.buildConcatVectors( + Ty.changeElementCount(Ty.getElementCount() * 2), + {OpLHS, MIB.buildUndef(Ty).getReg(0)}) + .getReg(0); + Register LaneR = MIB.buildConstant(LLT::scalar(64), Lane).getReg(0); + return MIB.buildInstr(Opcode, {Ty}, {OpLHS, LaneR}).getReg(0); + }; + auto BuildExt = [&MIB, &MRI](Register OpLHS, Register OpRHS, unsigned Imm) { + LLT Ty = MRI.getType(OpLHS); + Imm = Imm * Ty.getScalarSizeInBits() / 8; + return MIB + .buildInstr(AArch64::G_EXT, {Ty}, + {OpLHS, OpRHS, MIB.buildConstant(LLT::scalar(64), Imm)}) + .getReg(0); + }; + auto BuildZipLike = [&MIB, &MRI](unsigned OpNum, Register OpLHS, + Register OpRHS) { + LLT Ty = MRI.getType(OpLHS); + unsigned Opc = 0; + switch (OpNum) { + default: + llvm_unreachable("Unexpected perfect shuffle opcode"); + case OP_VUZPL: + Opc = AArch64::G_UZP1; + break; + case OP_VUZPR: + Opc = AArch64::G_UZP2; + break; + case OP_VZIPL: + Opc = AArch64::G_ZIP1; + break; + case OP_VZIPR: + Opc = AArch64::G_ZIP2; + break; + case OP_VTRNL: + Opc = AArch64::G_TRN1; + break; + case OP_VTRNR: + Opc = AArch64::G_TRN2; + } + return MIB.buildInstr(Opc, {Ty}, {OpLHS, OpRHS}).getReg(0); + }; + auto BuildExtractInsert64 = [&MIB, &MRI](Register ExtSrc, unsigned ExtLane, + Register InsSrc, unsigned InsLane) { + LLT Ty = MRI.getType(InsSrc); + if (Ty.getScalarSizeInBits() == 16 && Ty != LLT::fixed_vector(2, 32)) { + ExtSrc = MIB.buildBitcast(LLT::fixed_vector(2, 32), ExtSrc).getReg(0); + InsSrc = MIB.buildBitcast(LLT::fixed_vector(2, 32), InsSrc).getReg(0); + } else if (Ty.getScalarSizeInBits() == 32 && + Ty != LLT::fixed_vector(2, 64)) { + ExtSrc = MIB.buildBitcast(LLT::fixed_vector(2, 64), ExtSrc).getReg(0); + InsSrc = MIB.buildBitcast(LLT::fixed_vector(2, 64), InsSrc).getReg(0); + } + auto Ext = MIB.buildExtractVectorElement( + MRI.getType(ExtSrc).getElementType(), ExtSrc, + MIB.buildConstant(LLT::scalar(64), ExtLane)); + auto Ins = MIB.buildInsertVectorElement( + MRI.getType(ExtSrc), InsSrc, Ext, + MIB.buildConstant(LLT::scalar(64), InsLane)); + return MIB.buildBitcast(Ty, Ins).getReg(0); + }; + auto BuildExtractInsert32 = [&MIB, &MRI](Register ExtSrc, unsigned ExtLane, + Register InsSrc, unsigned InsLane) { + LLT Ty = MRI.getType(InsSrc); + if (Ty.getScalarSizeInBits() == 16 && Ty != LLT::fixed_vector(4, 16)) { + ExtSrc = MIB.buildBitcast(LLT::fixed_vector(2, 32), ExtSrc).getReg(0); + InsSrc = MIB.buildBitcast(LLT::fixed_vector(2, 32), InsSrc).getReg(0); + } + auto Ext = MIB.buildExtractVectorElement( + MRI.getType(ExtSrc).getElementType(), ExtSrc, + MIB.buildConstant(LLT::scalar(64), ExtLane)); + auto Ins = MIB.buildInsertVectorElement( + MRI.getType(ExtSrc), InsSrc, Ext, + MIB.buildConstant(LLT::scalar(64), InsLane)); + if (MRI.getType(Ins.getReg(0)) != Ty) + Ins = MIB.buildBitcast(Ty, Ins); + return Ins.getReg(0); + }; + Register Res = generatePerfectShuffle( + PFTableIndex, LHS, RHS, PFEntry, LHS, RHS, BuildExtractInsert64, + 
BuildExtractInsert32, BuildRev, BuildDup, BuildExt, BuildZipLike); + MIB.buildCopy(Dst, Res); + MI.eraseFromParent(); +} + /// isVShiftRImm - Check if this is a valid vector for the immediate /// operand of a vector shift right operation. The value must be in the range: /// 1 <= Value <= ElementBits for a right shift. diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir index 7c7689bcb80b5..89bd8f63af02b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir @@ -280,8 +280,16 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s64) - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[IVEC]](<4 x s32>), [[DEF]], shufflemask(undef, 0, 0, 3) - ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[IVEC]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[EVEC]](s32), [[C2]](s64) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[IVEC]](<4 x s32>), [[C3]](s64) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], [[EVEC1]](s32), [[C4]](s64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC2]](<4 x s32>) + ; CHECK-NEXT: $q0 = COPY [[COPY1]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(s32) = COPY $s0 %2:_(<4 x s32>) = G_IMPLICIT_DEF @@ -394,35 +402,26 @@ body: | ; The G_SHUFFLE_VECTOR is fed by a G_BUILD_VECTOR, and the 0th input ; operand is not a constant. We should get a G_DUP. 
; - ; CHECK-LABEL: name: build_vector + ; CHECK-LABEL: name: build_vector_rhs ; CHECK: liveins: $w0, $w1, $w2, $w3, $w4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %lane_0:_(s32) = COPY $w0 ; CHECK-NEXT: %lane_1:_(s32) = COPY $w1 ; CHECK-NEXT: %b:_(s32) = COPY $w2 ; CHECK-NEXT: %c:_(s32) = COPY $w3 ; CHECK-NEXT: %d:_(s32) = COPY $w4 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane_0(s32), [[C]](s64) + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane_1(s32), [[C]](s64) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64) - ; CHECK-NEXT: %buildvec0:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>) - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: %buildvec1:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[IVEC4:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], %lane_1(s32), [[C4]](s64) - ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[IVEC5:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC4]], %b(s32), [[C5]](s64) - ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; CHECK-NEXT: [[IVEC6:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC5]], %c(s32), [[C6]](s64) - ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 - ; CHECK-NEXT: [[IVEC7:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC6]], %d(s32), [[C7]](s64) - ; CHECK-NEXT: %buildvec1:_(<4 x s32>) = COPY [[IVEC7]](<4 x s32>) - ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec0(<4 x s32>), %buildvec1, shufflemask(4, 4, 4, 4) + ; CHECK-NEXT: [[DUPLANE32_:%[0-9]+]]:_(<4 x s32>) = G_DUPLANE32 %buildvec1, [[C4]](s64) + ; CHECK-NEXT: %shuf:_(<4 x s32>) = COPY [[DUPLANE32_]](<4 x s32>) ; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %lane_0:_(s32) = COPY $w0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir index d1d5c6c29ba0d..d466df2a55c53 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir @@ -67,8 +67,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(0, 1, 4, 6) - ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[ZIP1_:%[0-9]+]]:_(<4 x s32>) = G_ZIP1 [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[UZP1_:%[0-9]+]]:_(<4 x s32>) = G_UZP1 [[ZIP1_]], [[COPY1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[UZP1_]](<4 x s32>) + ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = COPY $q1 @@ -92,8 +94,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(1, 4, 5, 7) - ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[UZP2_:%[0-9]+]]:_(<4 x s32>) = G_UZP2 [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[UZP2_]], [[EVEC]](s32), [[C1]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC]](<4 x s32>) + ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = COPY $q1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir index bcf088287f46a..afd5eaa8867bc 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir @@ -220,8 +220,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(3, 4, 1, 5) - ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[ZIP1_:%[0-9]+]]:_(<4 x s32>) = G_ZIP1 [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[ZIP1_]], [[EVEC]](s32), [[C1]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC]](<4 x s32>) + ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = COPY $q1 diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index fb6575cc0ee83..ebc626669fecf 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -196,23 +196,21 @@ define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture r ; ; CHECK-GI-LABEL: matrix_mul_double_shuffle: ; CHECK-GI: // %bb.0: // %vector.header -; CHECK-GI-NEXT: and w9, w3, #0xffff -; CHECK-GI-NEXT: adrp x8, .LCPI2_0 -; CHECK-GI-NEXT: dup v0.4s, w9 -; CHECK-GI-NEXT: mov w9, w0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-GI-NEXT: and x8, x9, #0xfffffff8 +; CHECK-GI-NEXT: and w8, w3, #0xffff +; CHECK-GI-NEXT: dup v0.4s, w8 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: and x8, x8, #0xfffffff8 ; CHECK-GI-NEXT: .LBB2_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldrh w9, [x2], #16 ; CHECK-GI-NEXT: subs x8, x8, #8 -; CHECK-GI-NEXT: mov v2.s[0], w9 +; CHECK-GI-NEXT: mov v1.s[0], w9 ; CHECK-GI-NEXT: mov w9, w0 ; CHECK-GI-NEXT: add w0, w0, #8 ; CHECK-GI-NEXT: lsl x9, x9, #2 -; CHECK-GI-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v1.16b -; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: str q2, [x1, x9] +; CHECK-GI-NEXT: mov v1.d[1], v1.d[0] +; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: str q1, [x1, x9] ; CHECK-GI-NEXT: b.ne .LBB2_1 ; 
CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 4c28ea7592202..76993ef2524dc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -442,85 +442,45 @@ entry:
 
 ; Also test the DUP path in the PerfectShuffle generator.
 define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    trn1.4h v0, v0, v0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    mov.s v0[1], v1[0]
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
-; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI34_0]
-; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    trn1.4h v0, v0, v0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
   ret <4 x i16> %r
 }
 
 define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    trn1.4h v0, v0, v0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    mov.s v0[1], v1[0]
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
-; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI35_0]
-; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    trn1.4h v0, v0, v0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
   ret <4 x half> %r
 }
 
 define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    trn1.4s v0, v0, v0
-; CHECK-SD-NEXT:    mov.d v0[1], v1[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    trn1.4s v0, v0, v0
+; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    ret
   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
   ret <4 x i32> %r
 }
 
 define <4 x float>
@test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    trn1.4s v0, v0, v0
-; CHECK-SD-NEXT:    mov.d v0[1], v1[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    trn1.4s v0, v0, v0
+; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    ret
   %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
   ret <4 x float> %r
 }
@@ -537,15 +497,12 @@ define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: disguised_dup:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI38_1
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_1]
-; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
-; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_0]
-; CHECK-GI-NEXT:    tbl.16b v2, { v0, v1 }, v2
-; CHECK-GI-NEXT:    str q0, [x0]
-; CHECK-GI-NEXT:    str q2, [x1]
+; CHECK-GI-NEXT:    ext.16b v1, v0, v0, #4
+; CHECK-GI-NEXT:    mov.s v1[2], v0[0]
+; CHECK-GI-NEXT:    zip2.4s v0, v1, v1
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ext.16b v0, v1, v0, #12
+; CHECK-GI-NEXT:    str q0, [x1]
 ; CHECK-GI-NEXT:    ret
   %shuf = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
   %dup = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 2a085dc0e72bf..ee9bccd0dbce2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -2136,19 +2136,10 @@ entry:
 }
 
 define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 {
-; CHECK-SD-LABEL: test_concat_v4i32_v4i32_v4i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v4i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI134_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI134_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
 entry:
   %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x i32> %vecinit6
@@ -2163,13 +2154,11 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
 ;
 ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    adrp x8, .LCPI135_0
-; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI135_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v0.16b
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; 
CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <2 x i32> %x, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index 6bdd5f998a3b9..9b9b38347ed70 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -498,10 +498,9 @@ define void @float_vrev64(ptr nocapture %source, ptr nocapture %dest) nounwind n
 ; CHECK-GI-LABEL: float_vrev64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    movi d0, #0000000000000000
-; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
 ; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    dup.4s v0, v0[0]
+; CHECK-GI-NEXT:    mov.s v0[1], v1[3]
 ; CHECK-GI-NEXT:    str q0, [x1, #176]
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 5e5fdd6d31705..5e91ba9c3092a 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -919,15 +919,12 @@ define i32 @extract_v4i32_shuffle(<4 x i32> %a, <4 x i32> %b, i32 %c) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sub sp, sp, #16
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    mov x9, sp
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI35_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
 ; CHECK-GI-NEXT:    and x8, x8, #0x3
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    mov v1.s[3], v0.s[3]
+; CHECK-GI-NEXT:    str q1, [sp]
 ; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
 ; CHECK-GI-NEXT:    add sp, sp, #16
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index fb65a748c865f..deaea6ad2eae8 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1432,10 +1432,8 @@ define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI95_0
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI95_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[1]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
@@ -1451,11 +1449,8 @@ define <4 x i32> @vselect_equivalent_shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; CHECK-GI-LABEL: vselect_equivalent_shuffle_v4i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI96_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI96_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    ret
   %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
   ret <4 x i32> %c
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 4c8f0c9c446f5..f35f586ea8e04 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -298,11 +298,9 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-GI-LABEL: shufflevector_v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
-; CHECK-GI-NEXT:    adrp x8, .LCPI19_0
 ; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI19_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-GI-NEXT:    zip1 v1.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    trn2 v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
   %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 1, i32 2>
@@ -358,12 +356,9 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; CHECK-GI-LABEL: shufflevector_v8i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI22_0
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
 ; CHECK-GI-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI22_0]
-; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-GI-NEXT:    tbl v1.16b, { v2.16b, v3.16b }, v4.16b
+; CHECK-GI-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mov v1.s[3], v3.s[3]
 ; CHECK-GI-NEXT:    ret
   %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
   ret <8 x i32> %c
@@ -620,22 +615,11 @@ define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
 }
 
 define <3 x i16> @shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) {
-; CHECK-SD-LABEL: shufflevector_v3i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    zip1 v1.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    zip2 v0.4h, v1.4h, v0.4h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: shufflevector_v3i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI37_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: shufflevector_v3i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v1.4h, v0.4h, v1.4h
+; CHECK-NEXT:    zip2 v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ret
   %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 1, i32 2, i32 4>
   ret <3 x i16> %c
 }
@@ -663,20 +647,11 @@ define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
 }
 
 define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) {
-; CHECK-SD-LABEL: shufflevector_v3i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    zip1 v1.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT:    zip2 v0.4s, v1.4s, v0.4s
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: shufflevector_v3i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI39_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: shufflevector_v3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    zip2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
   %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 1, i32 2, i32 4>
   ret <3 x i32> %c
 }
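For reference: the perfect-shuffle machinery above keys everything off a base-9 table index. Below is a minimal standalone C++ sketch of the indexing and decoding scheme, mirroring the expressions in applyPerfectShuffle and generatePerfectShuffle; the function names here are illustrative and are not LLVM APIs.

    #include <cassert>

    // Each of the four mask lanes selects one of the 8 input lanes (two
    // 4-element vectors), or 8 for undef, so a mask is a 4-digit base-9
    // number and the table holds 9^4 = 6561 entries (the table above is
    // declared with 6561 + 1 slots, the extra one being a terminator).
    unsigned getPerfectShuffleIndex(const int Mask[4]) {
      unsigned Index = 0;
      for (int I = 0; I != 4; ++I) {
        unsigned Lane = Mask[I] < 0 ? 8u : unsigned(Mask[I]); // undef -> 8
        assert(Lane <= 8 && "Lane must be 0-7, or 8 for undef");
        // Same as PFIndexes[0]*9*9*9 + PFIndexes[1]*9*9 + PFIndexes[2]*9 +
        // PFIndexes[3] in the patch.
        Index = Index * 9 + Lane;
      }
      return Index;
    }

    // A table entry packs the operation and two recursive operand IDs:
    // bits [29:26] hold OpNum (OP_COPY, OP_VREV, ..., OP_MOVLANE), and bits
    // [25:13] / [12:0] hold the 13-bit LHS/RHS sub-shuffle IDs, which
    // generatePerfectShuffle expands recursively. Cost bits above bit 29
    // are ignored here.
    void decodePerfectShuffleEntry(unsigned PFEntry, unsigned &OpNum,
                                   unsigned &LHSID, unsigned &RHSID) {
      OpNum = (PFEntry >> 26) & 0x0F;
      LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
      RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
    }

The recursion bottoms out at OP_COPY, whose two distinguished LHSID values (1*9+2)*9+3 and ((4*9+5)*9+6)*9+7 are simply the base-9 encodings of the identity masks <0,1,2,3> and <4,5,6,7> for the left and right inputs.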