diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c8e8a1e612e0a..ad737f75ed84f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17079,6 +17079,16 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
 
+  // Check if the interleave is a zext(shuffle), that can be better optimized
+  // into shift / and masks. For the moment we do this just for uitofp (not
+  // zext) to avoid issues with widening instructions.
+  if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
+        return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
+               SI->getType()->getScalarSizeInBits() * 4 ==
+                   SI->user_back()->getType()->getScalarSizeInBits();
+      }))
+    return false;
+
   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
 
   auto *FVTy = cast<FixedVectorType>(VTy);
diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
index 6d25c874a2893..2965996ddcb02 100644
--- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -472,59 +472,61 @@ define <8 x double> @uitofp_fadd(<32 x i16> %l) {
 define <8 x double> @uitofp_load_fadd(ptr %p) {
 ; CHECK-LABEL: uitofp_load_fadd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
-; CHECK-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v6.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v16.4s, v2.8h, #0
-; CHECK-NEXT:    ushll v17.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v18.4s, v3.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v3.4h, #0
-; CHECK-NEXT:    ushll2 v1.2d, v4.4s, #0
-; CHECK-NEXT:    ushll2 v2.2d, v5.4s, #0
-; CHECK-NEXT:    ushll v3.2d, v4.2s, #0
-; CHECK-NEXT:    ushll v4.2d, v5.2s, #0
-; CHECK-NEXT:    ushll2 v5.2d, v6.4s, #0
-; CHECK-NEXT:    ushll2 v19.2d, v7.4s, #0
-; CHECK-NEXT:    ushll v6.2d, v6.2s, #0
-; CHECK-NEXT:    ushll v7.2d, v7.2s, #0
-; CHECK-NEXT:    ushll2 v20.2d, v16.4s, #0
-; CHECK-NEXT:    ushll2 v21.2d, v17.4s, #0
-; CHECK-NEXT:    ushll v16.2d, v16.2s, #0
-; CHECK-NEXT:    ushll v17.2d, v17.2s, #0
-; CHECK-NEXT:    ushll v22.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v23.2d, v18.4s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    ushll v18.2d, v18.2s, #0
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    movi v0.2d, #0x0000000000ffff
+; CHECK-NEXT:    ldp q3, q4, [x0, #32]
+; CHECK-NEXT:    ushr v5.2d, v1.2d, #16
+; CHECK-NEXT:    ushr v6.2d, v2.2d, #16
+; CHECK-NEXT:    ushr v20.2d, v1.2d, #32
+; CHECK-NEXT:    ushr v7.2d, v3.2d, #16
+; CHECK-NEXT:    ushr v17.2d, v4.2d, #16
+; CHECK-NEXT:    ushr v22.2d, v2.2d, #32
+; CHECK-NEXT:    ushr v23.2d, v3.2d, #32
+; CHECK-NEXT:    ushr v24.2d, v4.2d, #32
+; CHECK-NEXT:    and v16.16b, v1.16b, v0.16b
+; CHECK-NEXT:    and v18.16b, v2.16b, v0.16b
+; CHECK-NEXT:    and v19.16b, v3.16b, v0.16b
+; CHECK-NEXT:    and v21.16b, v4.16b, v0.16b
+; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
+; CHECK-NEXT:    and v6.16b, v6.16b, v0.16b
+; CHECK-NEXT:    and v7.16b, v7.16b, v0.16b
+; CHECK-NEXT:    and v17.16b, v17.16b, v0.16b
+; CHECK-NEXT:    and v20.16b, v20.16b, v0.16b
+; CHECK-NEXT:    and v22.16b, v22.16b, v0.16b
+; CHECK-NEXT:    and v23.16b, v23.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v24.16b, v0.16b
+; CHECK-NEXT:    ushr v1.2d, v1.2d, #48
+; CHECK-NEXT:    ushr v2.2d, v2.2d, #48
+; CHECK-NEXT:    ushr v3.2d, v3.2d, #48
+; CHECK-NEXT:    ushr v4.2d, v4.2d, #48
+; CHECK-NEXT:    ucvtf v16.2d, v16.2d
+; CHECK-NEXT:    ucvtf v18.2d, v18.2d
 ; CHECK-NEXT:    ucvtf v19.2d, v19.2d
+; CHECK-NEXT:    ucvtf v21.2d, v21.2d
+; CHECK-NEXT:    ucvtf v5.2d, v5.2d
 ; CHECK-NEXT:    ucvtf v6.2d, v6.2d
 ; CHECK-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-NEXT:    ucvtf v20.2d, v20.2d
-; CHECK-NEXT:    ucvtf v21.2d, v21.2d
-; CHECK-NEXT:    ucvtf v16.2d, v16.2d
 ; CHECK-NEXT:    ucvtf v17.2d, v17.2d
+; CHECK-NEXT:    ucvtf v20.2d, v20.2d
 ; CHECK-NEXT:    ucvtf v22.2d, v22.2d
 ; CHECK-NEXT:    ucvtf v23.2d, v23.2d
 ; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v18.2d, v18.2d
-; CHECK-NEXT:    fadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    fadd v4.2d, v4.2d, v7.2d
-; CHECK-NEXT:    fadd v6.2d, v3.2d, v6.2d
-; CHECK-NEXT:    fadd v2.2d, v2.2d, v19.2d
-; CHECK-NEXT:    fadd v3.2d, v17.2d, v22.2d
-; CHECK-NEXT:    fadd v5.2d, v16.2d, v18.2d
-; CHECK-NEXT:    fadd v7.2d, v21.2d, v0.2d
-; CHECK-NEXT:    fadd v16.2d, v20.2d, v23.2d
-; CHECK-NEXT:    fadd v0.2d, v4.2d, v3.2d
-; CHECK-NEXT:    fadd v3.2d, v1.2d, v16.2d
-; CHECK-NEXT:    fadd v1.2d, v2.2d, v7.2d
-; CHECK-NEXT:    fadd v2.2d, v6.2d, v5.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-NEXT:    ucvtf v4.2d, v4.2d
+; CHECK-NEXT:    fadd v6.2d, v18.2d, v6.2d
+; CHECK-NEXT:    fadd v5.2d, v16.2d, v5.2d
+; CHECK-NEXT:    fadd v17.2d, v21.2d, v17.2d
+; CHECK-NEXT:    fadd v7.2d, v19.2d, v7.2d
+; CHECK-NEXT:    fadd v1.2d, v20.2d, v1.2d
+; CHECK-NEXT:    fadd v3.2d, v23.2d, v3.2d
+; CHECK-NEXT:    fadd v2.2d, v22.2d, v2.2d
+; CHECK-NEXT:    fadd v4.2d, v0.2d, v4.2d
+; CHECK-NEXT:    fadd v0.2d, v5.2d, v1.2d
+; CHECK-NEXT:    fadd v1.2d, v6.2d, v2.2d
+; CHECK-NEXT:    fadd v2.2d, v7.2d, v3.2d
+; CHECK-NEXT:    fadd v3.2d, v17.2d, v4.2d
 ; CHECK-NEXT:    ret
   %l = load <32 x i16>, ptr %p
   %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32>