[AArch64] Improve lowering for scalable masked deinterleaving loads
For IR like this:
%mask = ... @llvm.vector.interleave2(<vscale x 16 x i1> %a, <vscale x 16 x i1> %a)
%vec = ... @llvm.masked.load(..., <vscale x 32 x i1> %mask, ...)
%dvec = ... @llvm.vector.deinterleave2(<vscale x 32 x i8> %vec)
where we deinterleave a wide masked load of a supported type, and the mask is itself an interleave of a narrower mask, we can lower the whole sequence directly to a ld2b instruction. Similarly, we can support the other ld2 variants as well as ld4.
This PR adds a DAG combine to spot such patterns and lower them to the matching ld2 or ld4 variant, whilst being careful to ensure the masked load is only used by the deinterleave intrinsic.
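As a concrete sketch of the ld2 case for nxv16i8 (the function name here is hypothetical, not taken from the patch):

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
  ; Interleave the narrow mask to match the width of the wide load.
  %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
  ; Wide masked load whose only user is the deinterleave below.
  %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
  %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}

With the combine this should select to a single ld2b { z0.b, z1.b }, p0/z, [x0] (assuming %mask lands in p0), rather than a wide predicated load followed by uzp1/uzp2 pairs.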
%interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%wide.masked.vec = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0(ptr %p, i32 2, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i16> poison)
@@ -36,12 +26,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld2_nxv8i16(<vscale x 8 x
define { <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld2_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
%interleaved.mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
%wide.masked.vec = call <vscale x 4 x double> @llvm.masked.load.nxv4f64(ptr %p, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
@@ -68,24 +48,7 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld2_nxv2f64(<vscale
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
%interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -96,24 +59,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld4_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
%interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%wide.masked.vec = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16(ptr %p, i32 2, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i16> poison)
@@ -124,24 +70,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld4_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
%interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
%wide.masked.vec = call <vscale x 16 x float> @llvm.masked.load.nxv16f32(ptr %p, i32 4, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x float> poison)
@@ -152,24 +81,7 @@ define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vsca
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld4_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
%interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
%wide.masked.vec = call <vscale x 8 x double> @llvm.masked.load.nxv8f64(ptr %p, i32 8, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x double> poison)
@@ -181,28 +93,17 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mul_use_of_mask(<vscale x 16 x i1> %mask, ptr %p, ptr %p2) {
%interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -214,18 +115,8 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) {
%interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1))
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
@@ -236,18 +127,8 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_ones(ptr %p) {
; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_ones:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z1, [x0]
-; CHECK-NEXT: ldr z2, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #2, mul vl]
-; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b
-; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b
-; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b
-; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b
-; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b
-; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b
-; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
-; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
; CHECK-NEXT: ret
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> splat(i1 1), <vscale x 64 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)