|
4 | 4 | ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
|
5 | 5 | ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
|
6 | 6 | ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
|
7 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 |
8 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP |
9 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ |
10 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP |
11 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW |
12 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP |
13 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW |
14 |
| -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP |
| 7 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-VL |
| 8 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512-FCP |
| 9 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ |
| 10 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ-FCP |
| 11 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW |
| 12 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW-FCP |
| 13 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQ-BW |
| 14 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ-BW-FCP |
15 | 15 |
|
16 | 16 | ; These patterns are produced by LoopVectorizer for interleaved loads.
|
17 | 17 |
|
@@ -69,69 +69,6 @@ define void @load_i16_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
|
69 | 69 | ; AVX512-NEXT: vmovd %xmm1, (%rsi)
|
70 | 70 | ; AVX512-NEXT: vmovd %xmm0, (%rdx)
|
71 | 71 | ; AVX512-NEXT: retq
|
72 |
| -; |
73 |
| -; AVX512-FCP-LABEL: load_i16_stride2_vf2: |
74 |
| -; AVX512-FCP: # %bb.0: |
75 |
| -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
76 |
| -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
77 |
| -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
78 |
| -; AVX512-FCP-NEXT: vmovd %xmm1, (%rsi) |
79 |
| -; AVX512-FCP-NEXT: vmovd %xmm0, (%rdx) |
80 |
| -; AVX512-FCP-NEXT: retq |
81 |
| -; |
82 |
| -; AVX512DQ-LABEL: load_i16_stride2_vf2: |
83 |
| -; AVX512DQ: # %bb.0: |
84 |
| -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
85 |
| -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
86 |
| -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
87 |
| -; AVX512DQ-NEXT: vmovd %xmm1, (%rsi) |
88 |
| -; AVX512DQ-NEXT: vmovd %xmm0, (%rdx) |
89 |
| -; AVX512DQ-NEXT: retq |
90 |
| -; |
91 |
| -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf2: |
92 |
| -; AVX512DQ-FCP: # %bb.0: |
93 |
| -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
94 |
| -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
95 |
| -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
96 |
| -; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rsi) |
97 |
| -; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rdx) |
98 |
| -; AVX512DQ-FCP-NEXT: retq |
99 |
| -; |
100 |
| -; AVX512BW-LABEL: load_i16_stride2_vf2: |
101 |
| -; AVX512BW: # %bb.0: |
102 |
| -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
103 |
| -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
104 |
| -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
105 |
| -; AVX512BW-NEXT: vmovd %xmm1, (%rsi) |
106 |
| -; AVX512BW-NEXT: vmovd %xmm0, (%rdx) |
107 |
| -; AVX512BW-NEXT: retq |
108 |
| -; |
109 |
| -; AVX512BW-FCP-LABEL: load_i16_stride2_vf2: |
110 |
| -; AVX512BW-FCP: # %bb.0: |
111 |
| -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
112 |
| -; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
113 |
| -; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
114 |
| -; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rsi) |
115 |
| -; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rdx) |
116 |
| -; AVX512BW-FCP-NEXT: retq |
117 |
| -; |
118 |
| -; AVX512DQ-BW-LABEL: load_i16_stride2_vf2: |
119 |
| -; AVX512DQ-BW: # %bb.0: |
120 |
| -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
121 |
| -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
122 |
| -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
123 |
| -; AVX512DQ-BW-NEXT: vmovd %xmm1, (%rsi) |
124 |
| -; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rdx) |
125 |
| -; AVX512DQ-BW-NEXT: retq |
126 |
| -; |
127 |
| -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf2: |
128 |
| -; AVX512DQ-BW-FCP: # %bb.0: |
129 |
| -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero |
130 |
| -; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] |
131 |
| -; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] |
132 |
| -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rsi) |
133 |
| -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rdx) |
134 |
| -; AVX512DQ-BW-FCP-NEXT: retq |
135 | 72 | %wide.vec = load <4 x i16>, ptr %in.vec, align 64
|
136 | 73 | %strided.vec0 = shufflevector <4 x i16> %wide.vec, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
|
137 | 74 | %strided.vec1 = shufflevector <4 x i16> %wide.vec, <4 x i16> poison, <2 x i32> <i32 1, i32 3>
|
@@ -198,62 +135,6 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
|
198 | 135 | ; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
|
199 | 136 | ; AVX512-NEXT: vmovq %xmm1, (%rdx)
|
200 | 137 | ; AVX512-NEXT: retq
|
201 |
| -; |
202 |
| -; AVX512-FCP-LABEL: load_i16_stride2_vf4: |
203 |
| -; AVX512-FCP: # %bb.0: |
204 |
| -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
205 |
| -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
206 |
| -; AVX512-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
207 |
| -; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) |
208 |
| -; AVX512-FCP-NEXT: retq |
209 |
| -; |
210 |
| -; AVX512DQ-LABEL: load_i16_stride2_vf4: |
211 |
| -; AVX512DQ: # %bb.0: |
212 |
| -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 |
213 |
| -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
214 |
| -; AVX512DQ-NEXT: vpmovdw %xmm0, (%rsi) |
215 |
| -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) |
216 |
| -; AVX512DQ-NEXT: retq |
217 |
| -; |
218 |
| -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf4: |
219 |
| -; AVX512DQ-FCP: # %bb.0: |
220 |
| -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
221 |
| -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
222 |
| -; AVX512DQ-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
223 |
| -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) |
224 |
| -; AVX512DQ-FCP-NEXT: retq |
225 |
| -; |
226 |
| -; AVX512BW-LABEL: load_i16_stride2_vf4: |
227 |
| -; AVX512BW: # %bb.0: |
228 |
| -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 |
229 |
| -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
230 |
| -; AVX512BW-NEXT: vpmovdw %xmm0, (%rsi) |
231 |
| -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) |
232 |
| -; AVX512BW-NEXT: retq |
233 |
| -; |
234 |
| -; AVX512BW-FCP-LABEL: load_i16_stride2_vf4: |
235 |
| -; AVX512BW-FCP: # %bb.0: |
236 |
| -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
237 |
| -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
238 |
| -; AVX512BW-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
239 |
| -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx) |
240 |
| -; AVX512BW-FCP-NEXT: retq |
241 |
| -; |
242 |
| -; AVX512DQ-BW-LABEL: load_i16_stride2_vf4: |
243 |
| -; AVX512DQ-BW: # %bb.0: |
244 |
| -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 |
245 |
| -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
246 |
| -; AVX512DQ-BW-NEXT: vpmovdw %xmm0, (%rsi) |
247 |
| -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) |
248 |
| -; AVX512DQ-BW-NEXT: retq |
249 |
| -; |
250 |
| -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf4: |
251 |
| -; AVX512DQ-BW-FCP: # %bb.0: |
252 |
| -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 |
253 |
| -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] |
254 |
| -; AVX512DQ-BW-FCP-NEXT: vpmovdw %xmm0, (%rsi) |
255 |
| -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx) |
256 |
| -; AVX512DQ-BW-FCP-NEXT: retq |
257 | 138 | %wide.vec = load <8 x i16>, ptr %in.vec, align 64
|
258 | 139 | %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
259 | 140 | %strided.vec1 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
@@ -349,69 +230,6 @@ define void @load_i16_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
|
349 | 230 | ; AVX512-NEXT: vpmovdw %ymm1, (%rdx)
|
350 | 231 | ; AVX512-NEXT: vzeroupper
|
351 | 232 | ; AVX512-NEXT: retq
|
352 |
| -; |
353 |
| -; AVX512-FCP-LABEL: load_i16_stride2_vf8: |
354 |
| -; AVX512-FCP: # %bb.0: |
355 |
| -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
356 |
| -; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
357 |
| -; AVX512-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
358 |
| -; AVX512-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
359 |
| -; AVX512-FCP-NEXT: vzeroupper |
360 |
| -; AVX512-FCP-NEXT: retq |
361 |
| -; |
362 |
| -; AVX512DQ-LABEL: load_i16_stride2_vf8: |
363 |
| -; AVX512DQ: # %bb.0: |
364 |
| -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 |
365 |
| -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm1 |
366 |
| -; AVX512DQ-NEXT: vpmovdw %ymm0, (%rsi) |
367 |
| -; AVX512DQ-NEXT: vpmovdw %ymm1, (%rdx) |
368 |
| -; AVX512DQ-NEXT: vzeroupper |
369 |
| -; AVX512DQ-NEXT: retq |
370 |
| -; |
371 |
| -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf8: |
372 |
| -; AVX512DQ-FCP: # %bb.0: |
373 |
| -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
374 |
| -; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
375 |
| -; AVX512DQ-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
376 |
| -; AVX512DQ-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
377 |
| -; AVX512DQ-FCP-NEXT: vzeroupper |
378 |
| -; AVX512DQ-FCP-NEXT: retq |
379 |
| -; |
380 |
| -; AVX512BW-LABEL: load_i16_stride2_vf8: |
381 |
| -; AVX512BW: # %bb.0: |
382 |
| -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 |
383 |
| -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm1 |
384 |
| -; AVX512BW-NEXT: vpmovdw %ymm0, (%rsi) |
385 |
| -; AVX512BW-NEXT: vpmovdw %ymm1, (%rdx) |
386 |
| -; AVX512BW-NEXT: vzeroupper |
387 |
| -; AVX512BW-NEXT: retq |
388 |
| -; |
389 |
| -; AVX512BW-FCP-LABEL: load_i16_stride2_vf8: |
390 |
| -; AVX512BW-FCP: # %bb.0: |
391 |
| -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
392 |
| -; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
393 |
| -; AVX512BW-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
394 |
| -; AVX512BW-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
395 |
| -; AVX512BW-FCP-NEXT: vzeroupper |
396 |
| -; AVX512BW-FCP-NEXT: retq |
397 |
| -; |
398 |
| -; AVX512DQ-BW-LABEL: load_i16_stride2_vf8: |
399 |
| -; AVX512DQ-BW: # %bb.0: |
400 |
| -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 |
401 |
| -; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm1 |
402 |
| -; AVX512DQ-BW-NEXT: vpmovdw %ymm0, (%rsi) |
403 |
| -; AVX512DQ-BW-NEXT: vpmovdw %ymm1, (%rdx) |
404 |
| -; AVX512DQ-BW-NEXT: vzeroupper |
405 |
| -; AVX512DQ-BW-NEXT: retq |
406 |
| -; |
407 |
| -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf8: |
408 |
| -; AVX512DQ-BW-FCP: # %bb.0: |
409 |
| -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 |
410 |
| -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 |
411 |
| -; AVX512DQ-BW-FCP-NEXT: vpmovdw %ymm0, (%rsi) |
412 |
| -; AVX512DQ-BW-FCP-NEXT: vpmovdw %ymm1, (%rdx) |
413 |
| -; AVX512DQ-BW-FCP-NEXT: vzeroupper |
414 |
| -; AVX512DQ-BW-FCP-NEXT: retq |
415 | 233 | %wide.vec = load <16 x i16>, ptr %in.vec, align 64
|
416 | 234 | %strided.vec0 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
417 | 235 | %strided.vec1 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
@@ -544,69 +362,6 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
|
544 | 362 | ; AVX512-NEXT: vpmovdw %zmm1, (%rdx)
|
545 | 363 | ; AVX512-NEXT: vzeroupper
|
546 | 364 | ; AVX512-NEXT: retq
|
547 |
| -; |
548 |
| -; AVX512-FCP-LABEL: load_i16_stride2_vf16: |
549 |
| -; AVX512-FCP: # %bb.0: |
550 |
| -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
551 |
| -; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
552 |
| -; AVX512-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
553 |
| -; AVX512-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
554 |
| -; AVX512-FCP-NEXT: vzeroupper |
555 |
| -; AVX512-FCP-NEXT: retq |
556 |
| -; |
557 |
| -; AVX512DQ-LABEL: load_i16_stride2_vf16: |
558 |
| -; AVX512DQ: # %bb.0: |
559 |
| -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 |
560 |
| -; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 |
561 |
| -; AVX512DQ-NEXT: vpmovdw %zmm0, (%rsi) |
562 |
| -; AVX512DQ-NEXT: vpmovdw %zmm1, (%rdx) |
563 |
| -; AVX512DQ-NEXT: vzeroupper |
564 |
| -; AVX512DQ-NEXT: retq |
565 |
| -; |
566 |
| -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf16: |
567 |
| -; AVX512DQ-FCP: # %bb.0: |
568 |
| -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
569 |
| -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
570 |
| -; AVX512DQ-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
571 |
| -; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
572 |
| -; AVX512DQ-FCP-NEXT: vzeroupper |
573 |
| -; AVX512DQ-FCP-NEXT: retq |
574 |
| -; |
575 |
| -; AVX512BW-LABEL: load_i16_stride2_vf16: |
576 |
| -; AVX512BW: # %bb.0: |
577 |
| -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 |
578 |
| -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 |
579 |
| -; AVX512BW-NEXT: vpmovdw %zmm0, (%rsi) |
580 |
| -; AVX512BW-NEXT: vpmovdw %zmm1, (%rdx) |
581 |
| -; AVX512BW-NEXT: vzeroupper |
582 |
| -; AVX512BW-NEXT: retq |
583 |
| -; |
584 |
| -; AVX512BW-FCP-LABEL: load_i16_stride2_vf16: |
585 |
| -; AVX512BW-FCP: # %bb.0: |
586 |
| -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
587 |
| -; AVX512BW-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
588 |
| -; AVX512BW-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
589 |
| -; AVX512BW-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
590 |
| -; AVX512BW-FCP-NEXT: vzeroupper |
591 |
| -; AVX512BW-FCP-NEXT: retq |
592 |
| -; |
593 |
| -; AVX512DQ-BW-LABEL: load_i16_stride2_vf16: |
594 |
| -; AVX512DQ-BW: # %bb.0: |
595 |
| -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 |
596 |
| -; AVX512DQ-BW-NEXT: vpsrld $16, %zmm0, %zmm1 |
597 |
| -; AVX512DQ-BW-NEXT: vpmovdw %zmm0, (%rsi) |
598 |
| -; AVX512DQ-BW-NEXT: vpmovdw %zmm1, (%rdx) |
599 |
| -; AVX512DQ-BW-NEXT: vzeroupper |
600 |
| -; AVX512DQ-BW-NEXT: retq |
601 |
| -; |
602 |
| -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf16: |
603 |
| -; AVX512DQ-BW-FCP: # %bb.0: |
604 |
| -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 |
605 |
| -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %zmm0, %zmm1 |
606 |
| -; AVX512DQ-BW-FCP-NEXT: vpmovdw %zmm0, (%rsi) |
607 |
| -; AVX512DQ-BW-FCP-NEXT: vpmovdw %zmm1, (%rdx) |
608 |
| -; AVX512DQ-BW-FCP-NEXT: vzeroupper |
609 |
| -; AVX512DQ-BW-FCP-NEXT: retq |
610 | 365 | %wide.vec = load <32 x i16>, ptr %in.vec, align 64
|
611 | 366 | %strided.vec0 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
|
612 | 367 | %strided.vec1 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
|
@@ -817,18 +572,18 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
|
817 | 572 | ; AVX2-FCP-NEXT: vzeroupper
|
818 | 573 | ; AVX2-FCP-NEXT: retq
|
819 | 574 | ;
|
820 |
| -; AVX512-LABEL: load_i16_stride2_vf32: |
821 |
| -; AVX512: # %bb.0: |
822 |
| -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 |
823 |
| -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 |
824 |
| -; AVX512-NEXT: vpsrld $16, %zmm0, %zmm2 |
825 |
| -; AVX512-NEXT: vpsrld $16, %zmm1, %zmm3 |
826 |
| -; AVX512-NEXT: vpmovdw %zmm1, 32(%rsi) |
827 |
| -; AVX512-NEXT: vpmovdw %zmm0, (%rsi) |
828 |
| -; AVX512-NEXT: vpmovdw %zmm3, 32(%rdx) |
829 |
| -; AVX512-NEXT: vpmovdw %zmm2, (%rdx) |
830 |
| -; AVX512-NEXT: vzeroupper |
831 |
| -; AVX512-NEXT: retq |
| 575 | +; AVX512-VL-LABEL: load_i16_stride2_vf32: |
| 576 | +; AVX512-VL: # %bb.0: |
| 577 | +; AVX512-VL-NEXT: vmovdqa64 (%rdi), %zmm0 |
| 578 | +; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1 |
| 579 | +; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm2 |
| 580 | +; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm3 |
| 581 | +; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rsi) |
| 582 | +; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi) |
| 583 | +; AVX512-VL-NEXT: vpmovdw %zmm3, 32(%rdx) |
| 584 | +; AVX512-VL-NEXT: vpmovdw %zmm2, (%rdx) |
| 585 | +; AVX512-VL-NEXT: vzeroupper |
| 586 | +; AVX512-VL-NEXT: retq |
832 | 587 | ;
|
833 | 588 | ; AVX512-FCP-LABEL: load_i16_stride2_vf32:
|
834 | 589 | ; AVX512-FCP: # %bb.0:
|
@@ -1344,27 +1099,27 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
|
1344 | 1099 | ; AVX2-FCP-NEXT: vzeroupper
|
1345 | 1100 | ; AVX2-FCP-NEXT: retq
|
1346 | 1101 | ;
|
1347 |
| -; AVX512-LABEL: load_i16_stride2_vf64: |
1348 |
| -; AVX512: # %bb.0: |
1349 |
| -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 |
1350 |
| -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 |
1351 |
| -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 |
1352 |
| -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 |
1353 |
| -; AVX512-NEXT: vpmovdw %zmm1, %ymm4 |
1354 |
| -; AVX512-NEXT: vpsrld $16, %zmm1, %zmm1 |
1355 |
| -; AVX512-NEXT: vpsrld $16, %zmm0, %zmm5 |
1356 |
| -; AVX512-NEXT: vpsrld $16, %zmm3, %zmm6 |
1357 |
| -; AVX512-NEXT: vpsrld $16, %zmm2, %zmm7 |
1358 |
| -; AVX512-NEXT: vpmovdw %zmm0, (%rsi) |
1359 |
| -; AVX512-NEXT: vmovdqa %ymm4, 32(%rsi) |
1360 |
| -; AVX512-NEXT: vpmovdw %zmm2, 64(%rsi) |
1361 |
| -; AVX512-NEXT: vpmovdw %zmm3, 96(%rsi) |
1362 |
| -; AVX512-NEXT: vpmovdw %zmm7, 64(%rdx) |
1363 |
| -; AVX512-NEXT: vpmovdw %zmm6, 96(%rdx) |
1364 |
| -; AVX512-NEXT: vpmovdw %zmm5, (%rdx) |
1365 |
| -; AVX512-NEXT: vpmovdw %zmm1, 32(%rdx) |
1366 |
| -; AVX512-NEXT: vzeroupper |
1367 |
| -; AVX512-NEXT: retq |
| 1102 | +; AVX512-VL-LABEL: load_i16_stride2_vf64: |
| 1103 | +; AVX512-VL: # %bb.0: |
| 1104 | +; AVX512-VL-NEXT: vmovdqa64 (%rdi), %zmm0 |
| 1105 | +; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1 |
| 1106 | +; AVX512-VL-NEXT: vmovdqa64 128(%rdi), %zmm2 |
| 1107 | +; AVX512-VL-NEXT: vmovdqa64 192(%rdi), %zmm3 |
| 1108 | +; AVX512-VL-NEXT: vpmovdw %zmm1, %ymm4 |
| 1109 | +; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm1 |
| 1110 | +; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm5 |
| 1111 | +; AVX512-VL-NEXT: vpsrld $16, %zmm3, %zmm6 |
| 1112 | +; AVX512-VL-NEXT: vpsrld $16, %zmm2, %zmm7 |
| 1113 | +; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi) |
| 1114 | +; AVX512-VL-NEXT: vmovdqa %ymm4, 32(%rsi) |
| 1115 | +; AVX512-VL-NEXT: vpmovdw %zmm2, 64(%rsi) |
| 1116 | +; AVX512-VL-NEXT: vpmovdw %zmm3, 96(%rsi) |
| 1117 | +; AVX512-VL-NEXT: vpmovdw %zmm7, 64(%rdx) |
| 1118 | +; AVX512-VL-NEXT: vpmovdw %zmm6, 96(%rdx) |
| 1119 | +; AVX512-VL-NEXT: vpmovdw %zmm5, (%rdx) |
| 1120 | +; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rdx) |
| 1121 | +; AVX512-VL-NEXT: vzeroupper |
| 1122 | +; AVX512-VL-NEXT: retq |
1368 | 1123 | ;
|
1369 | 1124 | ; AVX512-FCP-LABEL: load_i16_stride2_vf64:
|
1370 | 1125 | ; AVX512-FCP: # %bb.0:
|
|
0 commit comments