@@ -5,13 +5,8 @@ define <8 x i8> @load4_v4i8_add(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
55; CHECK-LABEL: load4_v4i8_add:
66; CHECK: // %bb.0:
77; CHECK-NEXT: ldp s0, s1, [x0]
8- ; CHECK-NEXT: ldp s2, s3, [x1]
9- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
10- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
11- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
12- ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
13- ; CHECK-NEXT: uzp1 v0.8b, v0.8b, v2.8b
14- ; CHECK-NEXT: uzp1 v1.8b, v1.8b, v3.8b
8+ ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
9+ ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
1510; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
1611; CHECK-NEXT: ret
1712 %la = load <4 x i8 >, <4 x i8 > *%a
@@ -30,13 +25,8 @@ define <8 x i16> @load4_v4i8_zext_add(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
3025; CHECK-LABEL: load4_v4i8_zext_add:
3126; CHECK: // %bb.0:
3227; CHECK-NEXT: ldp s0, s1, [x0]
33- ; CHECK-NEXT: ldp s2, s3, [x1]
34- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
35- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
36- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
37- ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
38- ; CHECK-NEXT: uzp1 v0.8b, v0.8b, v2.8b
39- ; CHECK-NEXT: uzp1 v1.8b, v1.8b, v3.8b
28+ ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
29+ ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
4030; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
4131; CHECK-NEXT: ret
4232 %la = load <4 x i8 >, <4 x i8 > *%a
@@ -59,103 +49,49 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
5949; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
6050; CHECK-NEXT: sxtw x8, w1
6151; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
62- ; CHECK-NEXT: sxtw x9, w3
63- ; CHECK-NEXT: ldp s0, s1, [x0]
64- ; CHECK-NEXT: ldp s2, s3, [x2]
65- ; CHECK-NEXT: add x10, x0, x8
66- ; CHECK-NEXT: add x11, x2, x9
67- ; CHECK-NEXT: ushll v4.8h, v0.8b, #0
68- ; CHECK-NEXT: ushll v0.8h, v3.8b, #0
69- ; CHECK-NEXT: ldp s5, s3, [x10]
70- ; CHECK-NEXT: add x10, x10, x8
52+ ; CHECK-NEXT: sxtw x11, w3
53+ ; CHECK-NEXT: add x9, x0, x8
54+ ; CHECK-NEXT: add x12, x2, x11
55+ ; CHECK-NEXT: add x10, x9, x8
56+ ; CHECK-NEXT: add x13, x12, x11
7157; CHECK-NEXT: add x8, x10, x8
72- ; CHECK-NEXT: ldp s6, s7, [x11]
73- ; CHECK-NEXT: ldp s16, s17, [x10]
74- ; CHECK-NEXT: ldp s18, s21, [x8]
75- ; CHECK-NEXT: add x11, x11, x9
76- ; CHECK-NEXT: add x9, x11, x9
77- ; CHECK-NEXT: ushll v5.8h, v5.8b, #0
78- ; CHECK-NEXT: ushll v16.8h, v16.8b, #0
79- ; CHECK-NEXT: ushll v18.8h, v18.8b, #0
80- ; CHECK-NEXT: ldp s19, s20, [x11]
81- ; CHECK-NEXT: uzp1 v16.8b, v18.8b, v16.8b
82- ; CHECK-NEXT: uzp1 v4.8b, v5.8b, v4.8b
83- ; CHECK-NEXT: ldp s18, s5, [x9]
84- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
85- ; CHECK-NEXT: ushll v6.8h, v6.8b, #0
86- ; CHECK-NEXT: ushll v19.8h, v19.8b, #0
87- ; CHECK-NEXT: ushll v18.8h, v18.8b, #0
88- ; CHECK-NEXT: uzp1 v2.8b, v6.8b, v2.8b
89- ; CHECK-NEXT: uzp1 v18.8b, v18.8b, v19.8b
58+ ; CHECK-NEXT: add x11, x13, x11
59+ ; CHECK-NEXT: ldp s1, s5, [x9]
60+ ; CHECK-NEXT: ldp s0, s4, [x8]
61+ ; CHECK-NEXT: ld1 { v0.s }[1], [x10], #4
62+ ; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4
63+ ; CHECK-NEXT: ldp s2, s6, [x11]
64+ ; CHECK-NEXT: ldp s3, s7, [x12]
65+ ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
9066; CHECK-NEXT: ushll v1.8h, v1.8b, #0
91- ; CHECK-NEXT: ushll v3.8h, v3.8b , #0
92- ; CHECK-NEXT: ushll v17.8h, v17.8b , #0
93- ; CHECK-NEXT: ushll v20.8h, v20.8b, #0
94- ; CHECK-NEXT: ushll v6.8h, v16.8b, #0
95- ; CHECK-NEXT: ushll v4.8h, v4.8b, #0
96- ; CHECK-NEXT: ushll v16.8h, v18.8b, #0
67+ ; CHECK-NEXT: ld1 { v2.s }[1], [x13] , #4
68+ ; CHECK-NEXT: ld1 { v3.s }[1], [x2] , #4
69+ ; CHECK-NEXT: ld1 { v4.s }[1], [x10]
70+ ; CHECK-NEXT: ld1 { v5.s }[1], [x0]
71+ ; CHECK-NEXT: ld1 { v6.s }[1], [x13]
72+ ; CHECK-NEXT: ld1 { v7.s }[1], [x2]
9773; CHECK-NEXT: ushll v2.8h, v2.8b, #0
98- ; CHECK-NEXT: ushll v19.8h, v21.8b, #0
99- ; CHECK-NEXT: ushll v5.8h, v5.8b, #0
100- ; CHECK-NEXT: ushll v7.8h, v7.8b, #0
101- ; CHECK-NEXT: usubl v18.4s, v6.4h, v16.4h
102- ; CHECK-NEXT: usubl2 v6.4s, v6.8h, v16.8h
103- ; CHECK-NEXT: usubl v16.4s, v4.4h, v2.4h
104- ; CHECK-NEXT: usubl2 v2.4s, v4.8h, v2.8h
105- ; CHECK-NEXT: uzp1 v4.8b, v19.8b, v17.8b
106- ; CHECK-NEXT: uzp1 v1.8b, v3.8b, v1.8b
107- ; CHECK-NEXT: uzp1 v3.8b, v5.8b, v20.8b
108- ; CHECK-NEXT: uzp1 v0.8b, v7.8b, v0.8b
109- ; CHECK-NEXT: ushll v4.8h, v4.8b, #0
11074; CHECK-NEXT: ushll v3.8h, v3.8b, #0
111- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
112- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
113- ; CHECK-NEXT: usubl2 v5.4s, v4.8h, v3.8h
114- ; CHECK-NEXT: usubl v3.4s, v4.4h, v3.4h
115- ; CHECK-NEXT: usubl2 v4.4s, v1.8h, v0.8h
116- ; CHECK-NEXT: usubl v0.4s, v1.4h, v0.4h
117- ; CHECK-NEXT: shl v1.4s, v3.4s, #16
118- ; CHECK-NEXT: shl v3.4s, v5.4s, #16
75+ ; CHECK-NEXT: usubl v16.4s, v0.4h, v2.4h
76+ ; CHECK-NEXT: usubl2 v0.4s, v0.8h, v2.8h
77+ ; CHECK-NEXT: usubl v2.4s, v1.4h, v3.4h
78+ ; CHECK-NEXT: usubl2 v1.4s, v1.8h, v3.8h
79+ ; CHECK-NEXT: ushll v3.8h, v4.8b, #0
80+ ; CHECK-NEXT: ushll v4.8h, v5.8b, #0
81+ ; CHECK-NEXT: ushll v5.8h, v6.8b, #0
82+ ; CHECK-NEXT: ushll v6.8h, v7.8b, #0
83+ ; CHECK-NEXT: usubl2 v7.4s, v3.8h, v5.8h
84+ ; CHECK-NEXT: usubl v3.4s, v3.4h, v5.4h
85+ ; CHECK-NEXT: usubl2 v5.4s, v4.8h, v6.8h
86+ ; CHECK-NEXT: usubl v4.4s, v4.4h, v6.4h
87+ ; CHECK-NEXT: shl v3.4s, v3.4s, #16
88+ ; CHECK-NEXT: shl v6.4s, v7.4s, #16
89+ ; CHECK-NEXT: shl v5.4s, v5.4s, #16
11990; CHECK-NEXT: shl v4.4s, v4.4s, #16
120- ; CHECK-NEXT: add v1 .4s, v1 .4s, v18 .4s
121- ; CHECK-NEXT: shl v0 .4s, v0 .4s, #16
122- ; CHECK-NEXT: add v3 .4s, v3 .4s, v6 .4s
91+ ; CHECK-NEXT: add v0 .4s, v6 .4s, v0 .4s
92+ ; CHECK-NEXT: add v3 .4s, v3 .4s, v16.4s
93+ ; CHECK-NEXT: add v1 .4s, v5 .4s, v1 .4s
12394; CHECK-NEXT: add v2.4s, v4.4s, v2.4s
124- ; CHECK-NEXT: rev64 v4.4s, v3.4s
125- ; CHECK-NEXT: rev64 v5.4s, v1.4s
126- ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
127- ; CHECK-NEXT: rev64 v6.4s, v2.4s
128- ; CHECK-NEXT: rev64 v7.4s, v0.4s
129- ; CHECK-NEXT: add v16.4s, v3.4s, v4.4s
130- ; CHECK-NEXT: add v17.4s, v1.4s, v5.4s
131- ; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s
132- ; CHECK-NEXT: trn2 v5.4s, v16.4s, v17.4s
133- ; CHECK-NEXT: add v18.4s, v2.4s, v6.4s
134- ; CHECK-NEXT: add v19.4s, v0.4s, v7.4s
135- ; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s
136- ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
137- ; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s
138- ; CHECK-NEXT: trn2 v4.4s, v19.4s, v18.4s
139- ; CHECK-NEXT: ext v6.16b, v5.16b, v16.16b, #8
140- ; CHECK-NEXT: zip1 v7.4s, v0.4s, v2.4s
141- ; CHECK-NEXT: trn2 v16.4s, v17.4s, v16.4s
142- ; CHECK-NEXT: ext v4.16b, v19.16b, v4.16b, #8
143- ; CHECK-NEXT: zip1 v20.4s, v3.4s, v1.4s
144- ; CHECK-NEXT: ext v7.16b, v0.16b, v7.16b, #8
145- ; CHECK-NEXT: ext v17.16b, v16.16b, v17.16b, #8
146- ; CHECK-NEXT: zip2 v18.4s, v19.4s, v18.4s
147- ; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s
148- ; CHECK-NEXT: mov v0.s[3], v2.s[2]
149- ; CHECK-NEXT: mov v5.d[1], v4.d[1]
150- ; CHECK-NEXT: mov v20.d[1], v7.d[1]
151- ; CHECK-NEXT: mov v17.d[1], v18.d[1]
152- ; CHECK-NEXT: mov v16.d[1], v4.d[1]
153- ; CHECK-NEXT: mov v1.d[1], v0.d[1]
154- ; CHECK-NEXT: mov v6.d[1], v18.d[1]
155- ; CHECK-NEXT: add v0.4s, v17.4s, v16.4s
156- ; CHECK-NEXT: add v2.4s, v1.4s, v20.4s
157- ; CHECK-NEXT: sub v3.4s, v5.4s, v6.4s
158- ; CHECK-NEXT: sub v1.4s, v20.4s, v1.4s
15995; CHECK-NEXT: rev64 v4.4s, v0.4s
16096; CHECK-NEXT: rev64 v5.4s, v3.4s
16197; CHECK-NEXT: rev64 v6.4s, v1.4s
@@ -164,43 +100,77 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
164100; CHECK-NEXT: add v17.4s, v3.4s, v5.4s
165101; CHECK-NEXT: add v18.4s, v1.4s, v6.4s
166102; CHECK-NEXT: add v19.4s, v2.4s, v7.4s
167- ; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s
168103; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s
169- ; CHECK-NEXT: sub v3 .4s, v3 .4s, v5 .4s
104+ ; CHECK-NEXT: sub v2 .4s, v2 .4s, v7 .4s
170105; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s
106+ ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
107+ ; CHECK-NEXT: trn2 v4.4s, v16.4s, v17.4s
108+ ; CHECK-NEXT: trn2 v5.4s, v19.4s, v18.4s
109+ ; CHECK-NEXT: zip1 v7.4s, v2.4s, v1.4s
110+ ; CHECK-NEXT: trn2 v20.4s, v17.4s, v16.4s
111+ ; CHECK-NEXT: zip1 v6.4s, v0.4s, v3.4s
112+ ; CHECK-NEXT: zip2 v18.4s, v19.4s, v18.4s
113+ ; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #8
114+ ; CHECK-NEXT: ext v16.16b, v4.16b, v16.16b, #8
115+ ; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #8
116+ ; CHECK-NEXT: ext v17.16b, v20.16b, v17.16b, #8
117+ ; CHECK-NEXT: zip2 v0.4s, v0.4s, v3.4s
118+ ; CHECK-NEXT: mov v2.s[3], v1.s[2]
119+ ; CHECK-NEXT: mov v4.d[1], v5.d[1]
120+ ; CHECK-NEXT: mov v6.d[1], v7.d[1]
121+ ; CHECK-NEXT: mov v17.d[1], v18.d[1]
122+ ; CHECK-NEXT: mov v20.d[1], v5.d[1]
123+ ; CHECK-NEXT: mov v0.d[1], v2.d[1]
124+ ; CHECK-NEXT: mov v16.d[1], v18.d[1]
125+ ; CHECK-NEXT: add v1.4s, v17.4s, v20.4s
126+ ; CHECK-NEXT: add v2.4s, v0.4s, v6.4s
127+ ; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s
128+ ; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s
129+ ; CHECK-NEXT: rev64 v4.4s, v1.4s
130+ ; CHECK-NEXT: rev64 v5.4s, v3.4s
131+ ; CHECK-NEXT: rev64 v6.4s, v0.4s
132+ ; CHECK-NEXT: rev64 v7.4s, v2.4s
133+ ; CHECK-NEXT: add v16.4s, v1.4s, v4.4s
134+ ; CHECK-NEXT: add v17.4s, v3.4s, v5.4s
135+ ; CHECK-NEXT: add v18.4s, v0.4s, v6.4s
136+ ; CHECK-NEXT: add v19.4s, v2.4s, v7.4s
137+ ; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s
138+ ; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s
139+ ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
140+ ; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
171141; CHECK-NEXT: ext v4.16b, v2.16b, v19.16b, #12
172- ; CHECK-NEXT: ext v5.16b, v1 .16b, v18.16b, #12
142+ ; CHECK-NEXT: ext v5.16b, v0 .16b, v18.16b, #12
173143; CHECK-NEXT: ext v7.16b, v3.16b, v17.16b, #12
174144; CHECK-NEXT: rev64 v16.4s, v16.4s
175145; CHECK-NEXT: ext v6.16b, v4.16b, v2.16b, #4
176146; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #8
177- ; CHECK-NEXT: ext v18.16b, v5.16b, v1 .16b, #4
147+ ; CHECK-NEXT: ext v18.16b, v5.16b, v0 .16b, #4
178148; CHECK-NEXT: ext v19.16b, v5.16b, v5.16b, #8
179149; CHECK-NEXT: ext v20.16b, v7.16b, v3.16b, #4
180150; CHECK-NEXT: ext v21.16b, v7.16b, v7.16b, #8
181151; CHECK-NEXT: rev64 v7.4s, v7.4s
182- ; CHECK-NEXT: trn2 v0 .4s, v16.4s, v0 .4s
152+ ; CHECK-NEXT: trn2 v1 .4s, v16.4s, v1 .4s
183153; CHECK-NEXT: rev64 v5.4s, v5.4s
184154; CHECK-NEXT: rev64 v4.4s, v4.4s
185155; CHECK-NEXT: ext v6.16b, v6.16b, v17.16b, #12
186156; CHECK-NEXT: ext v17.16b, v18.16b, v19.16b, #12
187157; CHECK-NEXT: ext v18.16b, v20.16b, v21.16b, #12
188158; CHECK-NEXT: ext v3.16b, v7.16b, v3.16b, #4
189- ; CHECK-NEXT: ext v7.16b, v0 .16b, v0 .16b, #8
190- ; CHECK-NEXT: ext v1 .16b, v5.16b, v1 .16b, #4
159+ ; CHECK-NEXT: ext v7.16b, v1 .16b, v1 .16b, #8
160+ ; CHECK-NEXT: ext v0 .16b, v5.16b, v0 .16b, #4
191161; CHECK-NEXT: ext v2.16b, v4.16b, v2.16b, #4
192162; CHECK-NEXT: add v4.4s, v18.4s, v3.4s
193- ; CHECK-NEXT: add v5.4s, v0 .4s, v7.4s
194- ; CHECK-NEXT: add v16.4s, v17.4s, v1 .4s
163+ ; CHECK-NEXT: add v5.4s, v1 .4s, v7.4s
164+ ; CHECK-NEXT: add v16.4s, v17.4s, v0 .4s
195165; CHECK-NEXT: add v19.4s, v6.4s, v2.4s
196166; CHECK-NEXT: sub v3.4s, v18.4s, v3.4s
197- ; CHECK-NEXT: sub v0 .4s, v0 .4s, v7.4s
167+ ; CHECK-NEXT: sub v1 .4s, v1 .4s, v7.4s
198168; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s
199- ; CHECK-NEXT: sub v1 .4s, v17.4s, v1 .4s
169+ ; CHECK-NEXT: sub v0 .4s, v17.4s, v0 .4s
200170; CHECK-NEXT: mov v19.d[1], v2.d[1]
201- ; CHECK-NEXT: mov v16.d[1], v1 .d[1]
171+ ; CHECK-NEXT: mov v16.d[1], v0 .d[1]
202172; CHECK-NEXT: mov v4.d[1], v3.d[1]
203- ; CHECK-NEXT: mov v5.d[1], v0 .d[1]
173+ ; CHECK-NEXT: mov v5.d[1], v1 .d[1]
204174; CHECK-NEXT: movi v0.8h, #1
205175; CHECK-NEXT: movi v7.2d, #0x00ffff0000ffff
206176; CHECK-NEXT: ushr v1.4s, v4.4s, #15
0 commit comments