; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2- ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s
2+ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
3+ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
34
45define <vscale x 4 x float > @fdot_wide_vl128 (<vscale x 4 x float > %acc , <vscale x 8 x half > %a , <vscale x 8 x half > %b ) {
5- ; CHECK-LABEL: fdot_wide_vl128:
6- ; CHECK: // %bb.0: // %entry
7- ; CHECK-NEXT: fdot z0.s, z1.h, z2.h
8- ; CHECK-NEXT: ret
6+ ; SVE2-LABEL: fdot_wide_vl128:
7+ ; SVE2: // %bb.0: // %entry
8+ ; SVE2-NEXT: uunpklo z3.s, z1.h
9+ ; SVE2-NEXT: uunpklo z4.s, z2.h
10+ ; SVE2-NEXT: ptrue p0.s
11+ ; SVE2-NEXT: uunpkhi z1.s, z1.h
12+ ; SVE2-NEXT: uunpkhi z2.s, z2.h
13+ ; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
14+ ; SVE2-NEXT: fcvt z4.s, p0/m, z4.h
15+ ; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
16+ ; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
17+ ; SVE2-NEXT: fmul z3.s, z3.s, z4.s
18+ ; SVE2-NEXT: fmul z1.s, z1.s, z2.s
19+ ; SVE2-NEXT: fadd z0.s, z0.s, z3.s
20+ ; SVE2-NEXT: fadd z0.s, z0.s, z1.s
21+ ; SVE2-NEXT: ret
22+ ;
23+ ; SVE2P1-LABEL: fdot_wide_vl128:
24+ ; SVE2P1: // %bb.0: // %entry
25+ ; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
26+ ; SVE2P1-NEXT: ret
927entry:
1028 %a.wide = fpext <vscale x 8 x half > %a to <vscale x 8 x float >
1129 %b.wide = fpext <vscale x 8 x half > %b to <vscale x 8 x float >
@@ -15,26 +33,56 @@ entry:
1533}
1634
; fdot_splat_vl128: partial-reduce fadd with no multiply in the IR.
; %a is fpext'd from <vscale x 8 x half> to <vscale x 8 x float> and folded
; into the <vscale x 4 x float> accumulator %acc by
; @llvm.vector.partial.reduce.fadd.
;   SVE2   prefix (-mattr=+sve2):   expects the expanded sequence, i.e.
;          uunpklo/uunpkhi + fcvt on each half, then two fadds into z0.
;   SVE2P1 prefix (-mattr=+sve2p1): expects a single fdot whose second
;          operand is a splat of 1.0 materialised with fmov.
; The SVE2/SVE2P1 assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <vscale x 4 x float> @fdot_splat_vl128(<vscale x 4 x float> %acc, <vscale x 8 x half> %a) {
; SVE2-LABEL: fdot_splat_vl128:
; SVE2:       // %bb.0: // %entry
; SVE2-NEXT:    uunpklo z2.s, z1.h
; SVE2-NEXT:    ptrue p0.s
; SVE2-NEXT:    uunpkhi z1.s, z1.h
; SVE2-NEXT:    fcvt z2.s, p0/m, z2.h
; SVE2-NEXT:    fcvt z1.s, p0/m, z1.h
; SVE2-NEXT:    fadd z0.s, z0.s, z2.s
; SVE2-NEXT:    fadd z0.s, z0.s, z1.s
; SVE2-NEXT:    ret
;
; SVE2P1-LABEL: fdot_splat_vl128:
; SVE2P1:       // %bb.0: // %entry
; SVE2P1-NEXT:    fmov z2.h, #1.00000000
; SVE2P1-NEXT:    fdot z0.s, z1.h, z2.h
; SVE2P1-NEXT:    ret
entry:
  %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
  %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a.wide)
  ret <vscale x 4 x float> %partial.reduce
}
2857
2958define void @fdot_wide_vl256 (ptr %accptr , ptr %aptr , ptr %bptr ) vscale_range(2 ,2 ) {
30- ; CHECK-LABEL: fdot_wide_vl256:
31- ; CHECK: // %bb.0: // %entry
32- ; CHECK-NEXT: ldr z0, [x0]
33- ; CHECK-NEXT: ldr z1, [x1]
34- ; CHECK-NEXT: ldr z2, [x2]
35- ; CHECK-NEXT: fdot z0.s, z1.h, z2.h
36- ; CHECK-NEXT: str z0, [x0]
37- ; CHECK-NEXT: ret
59+ ; SVE2-LABEL: fdot_wide_vl256:
60+ ; SVE2: // %bb.0: // %entry
61+ ; SVE2-NEXT: ptrue p0.s
62+ ; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
63+ ; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
64+ ; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl]
65+ ; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl]
66+ ; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
67+ ; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
68+ ; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
69+ ; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
70+ ; SVE2-NEXT: fmul z0.s, z0.s, z1.s
71+ ; SVE2-NEXT: ldr z1, [x0]
72+ ; SVE2-NEXT: fmul z2.s, z2.s, z3.s
73+ ; SVE2-NEXT: fadd z0.s, z1.s, z0.s
74+ ; SVE2-NEXT: fadd z0.s, z0.s, z2.s
75+ ; SVE2-NEXT: str z0, [x0]
76+ ; SVE2-NEXT: ret
77+ ;
78+ ; SVE2P1-LABEL: fdot_wide_vl256:
79+ ; SVE2P1: // %bb.0: // %entry
80+ ; SVE2P1-NEXT: ldr z0, [x0]
81+ ; SVE2P1-NEXT: ldr z1, [x1]
82+ ; SVE2P1-NEXT: ldr z2, [x2]
83+ ; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
84+ ; SVE2P1-NEXT: str z0, [x0]
85+ ; SVE2P1-NEXT: ret
3886entry:
3987 %acc = load <8 x float >, ptr %accptr
4088 %a = load <16 x half >, ptr %aptr
0 commit comments