
Commit 19987ea

[AArch64] Improve lowering for scalable masked deinterleaving loads
For IR like this:

  %mask = ... @llvm.vector.interleave2(<vscale x 16 x i1> %a, <vscale x 16 x i1> %a)
  %vec = ... @llvm.masked.load(..., <vscale x 32 x i1> %mask, ...)
  %dvec = ... @llvm.vector.deinterleave2(<vscale x 32 x i8> %vec)

where we are deinterleaving a wide masked load of a supported type with an interleaved mask, we can lower the whole sequence directly to an ld2b instruction. The other ld2 and ld4 variants can be handled in the same way. This PR adds a DAG combine to spot such patterns and lower them to the appropriate ld2X or ld4X variant, whilst being careful to ensure the masked load is only used by the deinterleave intrinsic.
1 parent 67a4661 commit 19987ea
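As a concrete illustration, here is the ld2 case exercised by the first test below, reassembled from that test's IR and its updated CHECK line (the trailing deinterleave call sits outside the diff context, so its exact spelling is inferred from the surrounding tests):

  %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
  %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
  %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec)

With this patch the whole sequence lowers to a single predicated structure load, ld2b { z0.b, z1.b }, p0/z, [x0], instead of two ld1b loads plus uzp1/uzp2.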

File tree

  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll

2 files changed (+137, -139 lines)


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 117 additions & 0 deletions
@@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);

   setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);

   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
@@ -27015,6 +27016,120 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }

+static SDValue performVectorDeinterleaveCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+  unsigned NumParts = N->getNumOperands();
+  if (NumParts != 2 && NumParts != 4)
+    return SDValue();
+
+  EVT SubVecTy = N->getValueType(0);
+
+  // At the moment we're unlikely to see a fixed-width vector deinterleave as
+  // we usually generate shuffles instead.
+  unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
+  if (!SubVecTy.isScalableVT() ||
+      SubVecTy.getSizeInBits().getKnownMinValue() != 128 || MinNumElements == 1)
+    return SDValue();
+
+  // Make sure each input operand is the correct extract_subvector of the same
+  // wider vector.
+  SDValue Op0 = N->getOperand(0);
+  for (unsigned I = 0; I < NumParts; I++) {
+    SDValue OpI = N->getOperand(I);
+    if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        OpI->getOperand(0) != Op0->getOperand(0))
+      return SDValue();
+    auto *Idx = cast<ConstantSDNode>(OpI->getOperand(1));
+    if (Idx->getZExtValue() != (I * MinNumElements))
+      return SDValue();
+  }
+
+  // Normal loads are currently already handled by the InterleavedAccessPass so
+  // we don't expect to see them here. Bail out if the masked load has an
+  // unexpected number of uses, since we want to avoid a situation where we have
+  // both deinterleaving loads and normal loads in the same block. Also, discard
+  // masked loads that are extending, indexed, have an unexpected offset or have
+  // an unsupported passthru value until we find a valid use case.
+  auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
+  if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
+      MaskedLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+      MaskedLoad->getAddressingMode() != ISD::UNINDEXED ||
+      !MaskedLoad->getOffset().isUndef() ||
+      (!MaskedLoad->getPassThru()->isUndef() &&
+       !isZerosVector(MaskedLoad->getPassThru().getNode())))
+    return SDValue();
+
+  // Now prove that the mask is an interleave of identical masks.
+  SDValue Mask = MaskedLoad->getMask();
+  if (Mask->getOpcode() != ISD::SPLAT_VECTOR &&
+      Mask->getOpcode() != ISD::CONCAT_VECTORS)
+    return SDValue();
+
+  SDValue NarrowMask;
+  SDLoc DL(N);
+  if (Mask->getOpcode() == ISD::CONCAT_VECTORS) {
+    if (Mask->getNumOperands() != NumParts)
+      return SDValue();
+
+    // We should be concatenating each sequential result from a
+    // VECTOR_INTERLEAVE.
+    SDValue InterleaveOp = Mask->getOperand(0);
+    if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+        InterleaveOp->getNumOperands() != NumParts)
+      return SDValue();
+
+    for (unsigned I = 0; I < NumParts; I++) {
+      SDValue ConcatOp = Mask->getOperand(I);
+      if (ConcatOp.getResNo() != I ||
+          ConcatOp.getNode() != InterleaveOp.getNode())
+        return SDValue();
+    }
+
+    // Make sure the inputs to the vector interleave are identical.
+    for (unsigned I = 1; I < NumParts; I++) {
+      if (InterleaveOp->getOperand(I) != InterleaveOp->getOperand(0))
+        return SDValue();
+    }
+
+    NarrowMask = InterleaveOp->getOperand(0);
+  } else { // ISD::SPLAT_VECTOR
+    auto *SplatVal = dyn_cast<ConstantSDNode>(Mask->getOperand(0));
+    if (!SplatVal || SplatVal->getZExtValue() != 1)
+      return SDValue();
+    ElementCount EC = Mask.getValueType().getVectorElementCount();
+    assert((EC.getKnownMinValue() % NumParts) == 0 &&
+           "Expected element count divisible by number of parts");
+    EC = ElementCount::getScalable(EC.getKnownMinValue() / NumParts);
+    NarrowMask =
+        DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
+                    DAG.getConstant(1, DL, MVT::i1));
+  }
+
+  const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
+                                          : Intrinsic::aarch64_sve_ld4_sret;
+  SDValue NewLdOps[] = {MaskedLoad->getChain(),
+                        DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
+                        MaskedLoad->getBasePtr()};
+  SDValue Res;
+  if (NumParts == 2)
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
+  else
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
+                      NewLdOps);
+
+  // We can now generate a structured load!
+  SmallVector<SDValue, 4> ResOps(NumParts);
+  for (unsigned Idx = 0; Idx < NumParts; Idx++)
+    ResOps[Idx] = SDValue(Res.getNode(), Idx);
+
+  // Replace uses of the original chain result with the new chain result.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
+                                SDValue(Res.getNode(), NumParts));
+  return DCI.CombineTo(N, ResOps, false);
+}
+
 /// If the operand is a bitwise AND with a constant RHS, and the shift has a
 /// constant RHS and is the only use, we can pull it out of the shift, i.e.
 ///
@@ -27083,6 +27198,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   default:
     LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
     break;
+  case ISD::VECTOR_DEINTERLEAVE:
+    return performVectorDeinterleaveCombine(N, DCI, DAG);
   case ISD::VECREDUCE_AND:
   case ISD::VECREDUCE_OR:
   case ISD::VECREDUCE_XOR:
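At the DAG level the combine builds an INTRINSIC_W_CHAIN node for aarch64_sve_ld2_sret or aarch64_sve_ld4_sret with the narrow (de-interleaved) mask. For readers more used to the IR-level SVE intrinsics, the ld2 case corresponds roughly to a call like the one below; this is only an illustrative sketch (the patch works on SelectionDAG nodes and never emits this IR), with %narrow_mask and %base as placeholder names:

  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %narrow_mask, ptr %base)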

llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll

Lines changed: 20 additions & 139 deletions
@@ -4,12 +4,7 @@
 define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld2_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip2 p0.b, p0.b, p0.b
-; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0]
-; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b
-; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b
+; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
   %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
@@ -20,12 +15,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8(<vscale x 16
 define { <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld2_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld2_nxv8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.h, p0.h, p0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p0.h
-; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp1 z0.h, z2.h, z1.h
-; CHECK-NEXT: uzp2 z1.h, z2.h, z1.h
+; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
   %wide.masked.vec = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0(ptr %p, i32 2, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i16> poison)
@@ -36,12 +26,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld2_nxv8i16(<vscale x 8 x
 define { <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld2_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld2_nxv4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.s, p0.s, p0.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p0.s
-; CHECK-NEXT: ld1w { z2.s }, p1/z, [x0]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
-; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
+; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
   %wide.masked.vec = call <vscale x 8 x float> @llvm.masked.load.nxv8f32(ptr %p, i32 4, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x float> poison)
@@ -52,12 +37,7 @@ define { <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld2_nxv4f32(<vscale x
 define { <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld2_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld2_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 p1.d, p0.d, p0.d
-; CHECK-NEXT: zip2 p0.d, p0.d, p0.d
-; CHECK-NEXT: ld1d { z2.d }, p1/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
   %wide.masked.vec = call <vscale x 4 x double> @llvm.masked.load.nxv4f64(ptr %p, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
@@ -68,24 +48,7 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld2_nxv2f64(<vscale
 define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld4_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
-; CHECK-NEXT: ld1b { z3.b }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
-; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
-; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b
-; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
   %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -96,24 +59,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld4_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld4_nxv8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.h, p0.h, p0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
-; CHECK-NEXT: zip1 p2.h, p1.h, p1.h
-; CHECK-NEXT: zip2 p1.h, p1.h, p1.h
-; CHECK-NEXT: zip2 p3.h, p0.h, p0.h
-; CHECK-NEXT: ld1h { z3.h }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
-; CHECK-NEXT: ld1h { z2.h }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1h { z0.h }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
-; CHECK-NEXT: uzp1 z4.h, z3.h, z2.h
-; CHECK-NEXT: uzp2 z3.h, z3.h, z2.h
-; CHECK-NEXT: uzp1 z5.h, z1.h, z0.h
-; CHECK-NEXT: uzp2 z6.h, z1.h, z0.h
-; CHECK-NEXT: uzp1 z0.h, z5.h, z4.h
-; CHECK-NEXT: uzp1 z1.h, z6.h, z3.h
-; CHECK-NEXT: uzp2 z2.h, z5.h, z4.h
-; CHECK-NEXT: uzp2 z3.h, z6.h, z3.h
+; CHECK-NEXT: ld4h { z0.h - z3.h }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
   %wide.masked.vec = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16(ptr %p, i32 2, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i16> poison)
@@ -124,24 +70,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
 define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld4_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld4_nxv4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.s, p0.s, p0.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
-; CHECK-NEXT: zip1 p2.s, p1.s, p1.s
-; CHECK-NEXT: zip2 p1.s, p1.s, p1.s
-; CHECK-NEXT: zip2 p3.s, p0.s, p0.s
-; CHECK-NEXT: ld1w { z3.s }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
-; CHECK-NEXT: ld1w { z2.s }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1w { z0.s }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
-; CHECK-NEXT: uzp1 z4.s, z3.s, z2.s
-; CHECK-NEXT: uzp2 z3.s, z3.s, z2.s
-; CHECK-NEXT: uzp1 z5.s, z1.s, z0.s
-; CHECK-NEXT: uzp2 z6.s, z1.s, z0.s
-; CHECK-NEXT: uzp1 z0.s, z5.s, z4.s
-; CHECK-NEXT: uzp1 z1.s, z6.s, z3.s
-; CHECK-NEXT: uzp2 z2.s, z5.s, z4.s
-; CHECK-NEXT: uzp2 z3.s, z6.s, z3.s
+; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
   %wide.masked.vec = call <vscale x 16 x float> @llvm.masked.load.nxv16f32(ptr %p, i32 4, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x float> poison)
@@ -152,24 +81,7 @@ define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vsca
 define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld4_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
 ; CHECK-LABEL: foo_ld4_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.d, p0.d, p0.d
-; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
-; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
-; CHECK-NEXT: zip2 p1.d, p1.d, p1.d
-; CHECK-NEXT: zip2 p3.d, p0.d, p0.d
-; CHECK-NEXT: ld1d { z3.d }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
-; CHECK-NEXT: ld1d { z2.d }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1d { z0.d }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
-; CHECK-NEXT: uzp1 z4.d, z3.d, z2.d
-; CHECK-NEXT: uzp2 z3.d, z3.d, z2.d
-; CHECK-NEXT: uzp1 z5.d, z1.d, z0.d
-; CHECK-NEXT: uzp2 z6.d, z1.d, z0.d
-; CHECK-NEXT: uzp1 z0.d, z5.d, z4.d
-; CHECK-NEXT: uzp1 z1.d, z6.d, z3.d
-; CHECK-NEXT: uzp2 z2.d, z5.d, z4.d
-; CHECK-NEXT: uzp2 z3.d, z6.d, z3.d
+; CHECK-NEXT: ld4d { z0.d - z3.d }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
   %wide.masked.vec = call <vscale x 8 x double> @llvm.masked.load.nxv8f64(ptr %p, i32 8, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x double> poison)
@@ -181,28 +93,17 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mul_use_of_mask(<vscale x 16 x i1> %mask, ptr %p, ptr %p2) {
 ; CHECK-LABEL: foo_ld4_nxv16i8_mul_use_of_mask:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
 ; CHECK-NEXT: zip1 p2.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p0.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p3.b, p2.b, p2.b
-; CHECK-NEXT: ld1b { z3.b }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: zip1 p2.b, p2.b, p2.b
-; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1b { z1.b }, p2/z, [x0]
-; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b
-; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b
-; CHECK-NEXT: // fake_use: $p2
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
+; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
+; CHECK-NEXT: zip1 p3.b, p2.b, p2.b
+; CHECK-NEXT: zip2 p0.b, p1.b, p1.b
+; CHECK-NEXT: zip1 p1.b, p1.b, p1.b
+; CHECK-NEXT: zip2 p2.b, p2.b, p2.b
 ; CHECK-NEXT: // fake_use: $p3
-; CHECK-NEXT: // fake_use: $p0
+; CHECK-NEXT: // fake_use: $p2
 ; CHECK-NEXT: // fake_use: $p1
+; CHECK-NEXT: // fake_use: $p0
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
   %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -214,18 +115,8 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) {
 ; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_interleaved_ones:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z1, [x0]
-; CHECK-NEXT: ldr z2, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #2, mul vl]
-; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b
-; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1))
   %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -236,18 +127,8 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_ones(ptr %p) {
 ; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_ones:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z1, [x0]
-; CHECK-NEXT: ldr z2, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #2, mul vl]
-; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b
-; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
 ; CHECK-NEXT: ret
   %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> splat(i1 1), <vscale x 64 x i8> poison)
   %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)
