From 578aab29ad9b185ffa5ffad7f6ee2e22b796f9b5 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 17 May 2025 12:17:34 +0100 Subject: [PATCH] [GlobalISel] Add G_SHUFFLE_VECTOR computeKnownBits The code is taken from SelectionDAG::computeKnownBits. --- .../CodeGen/GlobalISel/GISelValueTracking.cpp | 28 +++++++ .../AArch64/GlobalISel/knownbits-shuffle.mir | 71 +++++++++++++++++ llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 49 ++++-------- .../AArch64/aarch64-matrix-umull-smull.ll | 78 ++++++++----------- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 32 +++----- 5 files changed, 158 insertions(+), 100 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shuffle.mir diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 589936b6c260f..748ecbd767c3e 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -629,6 +630,33 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Known.Zero.setBitsFrom(LowBits); break; } + case TargetOpcode::G_SHUFFLE_VECTOR: { + APInt DemandedLHS, DemandedRHS; + // Collect the known bits that are shared by every vector element referenced + // by the shuffle. + unsigned NumElts = MRI.getType(MI.getOperand(1).getReg()).getNumElements(); + if (!getShuffleDemandedElts(NumElts, MI.getOperand(3).getShuffleMask(), + DemandedElts, DemandedLHS, DemandedRHS)) + break; + + // Known bits are the values that are shared by every demanded element. 
+ Known.Zero.setAllBits(); + Known.One.setAllBits(); + if (!!DemandedLHS) { + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedLHS, + Depth + 1); + Known = Known.intersectWith(Known2); + } + // If we don't know any bits, early out. + if (Known.isUnknown()) + break; + if (!!DemandedRHS) { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedRHS, + Depth + 1); + Known = Known.intersectWith(Known2); + } + break; + } } LLVM_DEBUG(dumpResult(MI, Known, Depth)); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shuffle.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shuffle.mir new file mode 100644 index 0000000000000..a70a44790035a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shuffle.mir @@ -0,0 +1,71 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple aarch64 -passes="print<gisel-value-tracking>" %s -filetype=null 2>&1 | FileCheck %s + +--- +name: lane0 +body: | + bb.1: + ; CHECK-LABEL: name: @lane0 + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %1:_(<2 x s8>) = COPY $h1 + %2:_(<2 x s8>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 0) +... +--- +name: zext_known +body: | + bb.1: + ; CHECK-LABEL: name: @zext_known + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + %0:_(<2 x s8>) = COPY $h0 + %1:_(<2 x s16>) = COPY $s1 + %2:_(<2 x s16>) = G_ZEXT %0 + %3:_(<2 x s16>) = G_SHUFFLE_VECTOR %2, %1, shufflemask(0, 0) +... +--- +name: zext_unknown +body: | + bb.1: + ; CHECK-LABEL: name: @zext_unknown + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:????????????????
SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %1:_(<2 x s16>) = COPY $s1 + %2:_(<2 x s16>) = G_ZEXT %0 + %3:_(<2 x s16>) = G_SHUFFLE_VECTOR %2, %1, shufflemask(0, 2) +... +--- +name: sext_known +body: | + bb.1: + ; CHECK-LABEL: name: @sext_known + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %1:_(<2 x s16>) = COPY $s1 + %2:_(<2 x s16>) = G_SEXT %0 + %3:_(<2 x s16>) = G_SHUFFLE_VECTOR %2, %1, shufflemask(0, 0) +... +--- +name: sext_unknown +body: | + bb.1: + ; CHECK-LABEL: name: @sext_unknown + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %1:_(<2 x s16>) = COPY $s1 + %2:_(<2 x s16>) = G_SEXT %0 + %3:_(<2 x s16>) = G_SHUFFLE_VECTOR %2, %1, shufflemask(0, 2) +... 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index b0d30d89e1e6a..c417a5a75391b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -38,9 +38,9 @@ define <8 x i16> @dupzext_v8i8_v8i16(i8 %src, <8 x i8> %b) { ; CHECK-GI-LABEL: dupzext_v8i8_v8i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: dup v1.8h, w8 -; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: umull v0.8h, v1.8b, v0.8b ; CHECK-GI-NEXT: ret entry: %in = zext i8 %src to i16 @@ -84,9 +84,9 @@ define <4 x i32> @dupzext_v4i16_v4i32(i16 %src, <4 x i16> %b) { ; CHECK-GI-LABEL: dupzext_v4i16_v4i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xffff -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: dup v1.4s, w8 -; CHECK-GI-NEXT: mul v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: umull v0.4s, v1.4h, v0.4h ; CHECK-GI-NEXT: ret entry: %in = zext i16 %src to i32 @@ -138,16 +138,9 @@ define <2 x i64> @dupzext_v2i32_v2i64(i32 %src, <2 x i32> %b) { ; CHECK-GI-LABEL: dupzext_v2i32_v2i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, w0 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: dup v1.2d, x8 -; CHECK-GI-NEXT: fmov x9, d0 -; CHECK-GI-NEXT: mov x11, v0.d[1] -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: mov x10, v1.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d +; CHECK-GI-NEXT: umull v0.2d, v1.2s, v0.2s ; CHECK-GI-NEXT: ret entry: %in = zext i32 %src to i64 @@ -169,16 +162,9 @@ define <2 x i32> @dupzext_v2i32_v2i64_trunc(i32 %src, <2 x i32> %b) { ; CHECK-GI-LABEL: dupzext_v2i32_v2i64_trunc: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, w0 -; CHECK-GI-NEXT: ushll 
v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: dup v1.2d, x8 -; CHECK-GI-NEXT: fmov x9, d0 -; CHECK-GI-NEXT: mov x11, v0.d[1] -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: mov x10, v1.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d +; CHECK-GI-NEXT: umull v0.2d, v1.2s, v0.2s ; CHECK-GI-NEXT: xtn v0.2s, v0.2d ; CHECK-GI-NEXT: ret entry: @@ -240,14 +226,9 @@ define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) { ; CHECK-GI-NEXT: and x8, x0, #0xffff ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: dup v1.2d, x8 -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: fmov x9, d0 -; CHECK-GI-NEXT: mov x10, v1.d[1] -; CHECK-GI-NEXT: mov x11, v0.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d +; CHECK-GI-NEXT: xtn v0.2s, v0.2d +; CHECK-GI-NEXT: umull v0.2d, v1.2s, v0.2s ; CHECK-GI-NEXT: ret entry: %in = zext i16 %src to i64 @@ -491,10 +472,10 @@ define <8 x i16> @shufzext_v8i8_v8i16(<8 x i8> %src, <8 x i8> %b) { ; CHECK-GI-LABEL: shufzext_v8i8_v8i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: rev64 v0.8h, v0.8h ; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ret entry: %in = zext <8 x i8> %src to <8 x i16> @@ -545,8 +526,8 @@ define <8 x i16> @shufzext_v8i8_v8i16_twoin(<8 x i8> %src1, <8 x i8> %src2, <8 x ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-GI-NEXT: trn1 v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: ushll v1.8h, v2.8b, #0 -; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: umull v0.8h, v0.8b, v2.8b ; 
CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i8> %src1 to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index fb6575cc0ee83..eee1ec0b37315 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -28,6 +28,7 @@ define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonl ; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff8 +; CHECK-GI-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NEXT: .LBB0_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 @@ -35,10 +36,8 @@ define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonl ; CHECK-GI-NEXT: ldp d1, d2, [x9] ; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-GI-NEXT: add w0, w0, #8 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: stp q1, q2, [x9] ; CHECK-GI-NEXT: b.ne .LBB0_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 @@ -478,22 +477,21 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: mov x12, x8 ; CHECK-GI-NEXT: .LBB4_3: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldp q0, q1, [x11, #-16] ; CHECK-GI-NEXT: and w13, w1, #0xffff -; CHECK-GI-NEXT: dup v2.4s, w13 +; CHECK-GI-NEXT: ldp q1, q2, [x11, #-16] +; CHECK-GI-NEXT: dup v0.4s, w13 ; CHECK-GI-NEXT: mov x13, x10 ; CHECK-GI-NEXT: subs x12, x12, #16 ; CHECK-GI-NEXT: add x11, x11, #32 -; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: 
mul v3.4s, v2.4s, v3.4s -; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: mul v4.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: mul v1.4s, v2.4s, v1.4s -; CHECK-GI-NEXT: stp q3, q0, [x13, #-32]! -; CHECK-GI-NEXT: stp q4, q1, [x10], #64 +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: mov d4, v2.d[1] +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h +; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v4.4h +; CHECK-GI-NEXT: stp q1, q3, [x13, #-32]! +; CHECK-GI-NEXT: stp q2, q0, [x10], #64 ; CHECK-GI-NEXT: b.ne .LBB4_3 ; CHECK-GI-NEXT: // %bb.4: // %middle.block ; CHECK-GI-NEXT: cmp x8, x9 @@ -775,22 +773,15 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: mov x8, xzr ; CHECK-GI-NEXT: dup v0.2d, v0.d[1] -; CHECK-GI-NEXT: mov x9, v0.d[1] -; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: xtn v0.2s, v0.2d ; CHECK-GI-NEXT: .LBB6_1: // %loop ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 ; CHECK-GI-NEXT: add x8, x8, #8 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: mov x12, v0.d[1] -; CHECK-GI-NEXT: mul x11, x11, x10 -; CHECK-GI-NEXT: mul x12, x12, x9 -; CHECK-GI-NEXT: mov v0.d[0], x11 -; CHECK-GI-NEXT: mov v0.d[1], x12 -; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #15 -; CHECK-GI-NEXT: str d0, [x0], #32 +; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15 +; CHECK-GI-NEXT: str d1, [x0], #32 ; CHECK-GI-NEXT: b.ne .LBB6_1 ; CHECK-GI-NEXT: // %bb.2: // %exit ; CHECK-GI-NEXT: ret @@ -917,13 +908,14 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: mov x8, xzr +; CHECK-GI-NEXT: dup v0.8h, v0.h[0] +; CHECK-GI-NEXT: xtn v0.8b, 
v0.8h ; CHECK-GI-NEXT: .LBB8_1: // %loop ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 ; CHECK-GI-NEXT: add x8, x8, #8 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: mul v1.8h, v1.8h, v0.h[0] +; CHECK-GI-NEXT: umull v1.8h, v1.8b, v0.8b ; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15 ; CHECK-GI-NEXT: xtn v1.8b, v1.8h ; CHECK-GI-NEXT: str d1, [x0], #32 @@ -1046,6 +1038,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea ; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff8 +; CHECK-GI-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NEXT: .LBB10_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 @@ -1053,10 +1046,8 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea ; CHECK-GI-NEXT: ldp d1, d2, [x9] ; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-GI-NEXT: add w0, w0, #8 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: stp q1, q2, [x9] ; CHECK-GI-NEXT: b.ne .LBB10_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 @@ -1135,6 +1126,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt ; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff0 +; CHECK-GI-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NEXT: .LBB11_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 @@ -1143,16 +1135,14 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt ; CHECK-GI-NEXT: ldur q2, [x9, #8] ; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-GI-NEXT: add w0, w0, #16 -; CHECK-GI-NEXT: ushll v3.4s, 
v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s -; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: stp q3, q1, [x9] -; CHECK-GI-NEXT: stp q4, q2, [x9, #32]! +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: mov d4, v2.d[1] +; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h +; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h +; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h +; CHECK-GI-NEXT: stp q1, q3, [x9] +; CHECK-GI-NEXT: stp q2, q4, [x9, #32]! ; CHECK-GI-NEXT: b.ne .LBB11_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 714be46a015f4..951001c84aed0 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -1730,11 +1730,11 @@ define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) { ; CHECK-GI-LABEL: umull_and_v8i32_dup: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0 -; CHECK-GI-NEXT: dup v3.4s, w8 -; CHECK-GI-NEXT: mul v0.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: dup v1.4s, w8 +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v1.4s, v2.4h, v1.4h ; CHECK-GI-NEXT: ret entry: %in1 = zext <8 x i16> %src1 to <8 x i32> @@ -1819,23 +1819,11 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) { ; CHECK-GI-LABEL: umull_and_v4i64_dup: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: and x8, x0, #0xff -; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-GI-NEXT: dup v2.2d, 
x8 -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: fmov x12, d0 -; CHECK-GI-NEXT: mov x10, v1.d[1] -; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mov x11, v2.d[1] -; CHECK-GI-NEXT: mov x13, v0.d[1] -; CHECK-GI-NEXT: mul x8, x8, x9 -; CHECK-GI-NEXT: mul x9, x12, x9 -; CHECK-GI-NEXT: mul x10, x10, x11 -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: mul x11, x13, x11 -; CHECK-GI-NEXT: mov v1.d[0], x9 -; CHECK-GI-NEXT: mov v0.d[1], x10 -; CHECK-GI-NEXT: mov v1.d[1], x11 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: dup v1.2d, x8 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: umull v1.2d, v2.2s, v1.2s ; CHECK-GI-NEXT: ret entry: %in1 = zext <4 x i32> %src1 to <4 x i64>