; Test driver lines. The two entries marked `+` invoke llc for
; aarch64-none-linux-gnu with +neon,+i8mm, once without and once with
; -global-isel, and pipe the assembly into FileCheck using the same default
; check prefix, so both invocations must produce the expected output below.
1- ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
1+ ; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s
2+ ; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
23
; Passes %r, %a and %b straight to @llvm.aarch64.neon.smmla.v4i32.v16i8 and
; returns the result. The check lines expect the whole function body to be
; `smmla v0.4s, v1.16b, v2.16b` immediately followed by `ret`.
34define <4 x i32 > @smmla.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) {
5+ ; CHECK-LABEL: smmla.v4i32.v16i8:
6+ ; CHECK: // %bb.0: // %entry
7+ ; CHECK-NEXT: smmla v0.4s, v1.16b, v2.16b
8+ ; CHECK-NEXT: ret
49entry:
5- ; CHECK-LABEL: smmla.v4i32.v16i8
6- ; CHECK: smmla v0.4s, v1.16b, v2.16b
710 %vmmla1.i = tail call <4 x i32 > @llvm.aarch64.neon.smmla.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b )
811 ret <4 x i32 > %vmmla1.i
912}
1013
; Passes %r, %a and %b straight to @llvm.aarch64.neon.ummla.v4i32.v16i8 and
; returns the result. The check lines expect the whole function body to be
; `ummla v0.4s, v1.16b, v2.16b` immediately followed by `ret`.
1114define <4 x i32 > @ummla.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) {
15+ ; CHECK-LABEL: ummla.v4i32.v16i8:
16+ ; CHECK: // %bb.0: // %entry
17+ ; CHECK-NEXT: ummla v0.4s, v1.16b, v2.16b
18+ ; CHECK-NEXT: ret
1219entry:
13- ; CHECK-LABEL: ummla.v4i32.v16i8
14- ; CHECK: ummla v0.4s, v1.16b, v2.16b
1520 %vmmla1.i = tail call <4 x i32 > @llvm.aarch64.neon.ummla.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b )
1621 ret <4 x i32 > %vmmla1.i
1722}
1823
; Passes %r, %a and %b straight to @llvm.aarch64.neon.usmmla.v4i32.v16i8 and
; returns the result. The check lines expect the whole function body to be
; `usmmla v0.4s, v1.16b, v2.16b` immediately followed by `ret`.
; NOTE(review): the call site references attribute group #3, whose definition
; is not visible in this view.
1924define <4 x i32 > @usmmla.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) {
25+ ; CHECK-LABEL: usmmla.v4i32.v16i8:
26+ ; CHECK: // %bb.0: // %entry
27+ ; CHECK-NEXT: usmmla v0.4s, v1.16b, v2.16b
28+ ; CHECK-NEXT: ret
2029entry:
21- ; CHECK-LABEL: usmmla.v4i32.v16i8
22- ; CHECK: usmmla v0.4s, v1.16b, v2.16b
2330 %vusmmla1.i = tail call <4 x i32 > @llvm.aarch64.neon.usmmla.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) #3
2431 ret <4 x i32 > %vusmmla1.i
2532}
2633
; 64-bit vector form: passes %r, %a and %b straight to
; @llvm.aarch64.neon.usdot.v2i32.v8i8 and returns the result. The check lines
; expect `usdot v0.2s, v1.8b, v2.8b` immediately followed by `ret`.
2734define <2 x i32 > @usdot.v2i32.v8i8 (<2 x i32 > %r , <8 x i8 > %a , <8 x i8 > %b ) {
35+ ; CHECK-LABEL: usdot.v2i32.v8i8:
36+ ; CHECK: // %bb.0: // %entry
37+ ; CHECK-NEXT: usdot v0.2s, v1.8b, v2.8b
38+ ; CHECK-NEXT: ret
2839entry:
29- ; CHECK-LABEL: usdot.v2i32.v8i8
30- ; CHECK: usdot v0.2s, v1.8b, v2.8b
3140 %vusdot1.i = tail call <2 x i32 > @llvm.aarch64.neon.usdot.v2i32.v8i8 (<2 x i32 > %r , <8 x i8 > %a , <8 x i8 > %b )
3241 ret <2 x i32 > %vusdot1.i
3342}
3443
; Indexed form with a 64-bit %b: %b is viewed as <2 x i32>, element 0 is
; replicated into both result elements by the all-zero shuffle mask, and the
; result is viewed as <8 x i8> again. The check lines expect the indexed
; operand `v2.4b[0]` in `usdot v0.2s, v1.8b, v2.4b[0]`, preceded by a
; `// kill` annotation that redefines $d2 as $q2.
; NOTE(review): presumably the annotation appears because %b arrives as a
; 64-bit value while the indexed operand names the full register -- confirm.
; The intrinsic call and the return are elided from this view.
3544define <2 x i32 > @usdot_lane.v2i32.v8i8 (<2 x i32 > %r , <8 x i8 > %a , <8 x i8 > %b ) {
45+ ; CHECK-LABEL: usdot_lane.v2i32.v8i8:
46+ ; CHECK: // %bb.0: // %entry
47+ ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
48+ ; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
49+ ; CHECK-NEXT: ret
3650entry:
37- ; CHECK-LABEL: usdot_lane.v2i32.v8i8
38- ; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
3951 %0 = bitcast <8 x i8 > %b to <2 x i32 >
4052 %shuffle = shufflevector <2 x i32 > %0 , <2 x i32 > undef , <2 x i32 > zeroinitializer
4153 %1 = bitcast <2 x i32 > %shuffle to <8 x i8 >
@@ -44,9 +56,12 @@ entry:
4456}
4557
; Indexed form with a 64-bit %b: %b is viewed as <2 x i32>, element 0 is
; replicated into both result elements by the all-zero shuffle mask, and the
; result is viewed as <8 x i8> again. The check lines expect
; `sudot v0.2s, v1.8b, v2.4b[0]`, preceded by a `// kill` annotation that
; redefines $d2 as $q2, and followed by `ret`.
; The intrinsic call and the return are elided from this view, so the operand
; order passed to the intrinsic cannot be confirmed here.
4658define <2 x i32 > @sudot_lane.v2i32.v8i8 (<2 x i32 > %r , <8 x i8 > %a , <8 x i8 > %b ) {
59+ ; CHECK-LABEL: sudot_lane.v2i32.v8i8:
60+ ; CHECK: // %bb.0: // %entry
61+ ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
62+ ; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
63+ ; CHECK-NEXT: ret
4764entry:
48- ; CHECK-LABEL: sudot_lane.v2i32.v8i8
49- ; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
5065 %0 = bitcast <8 x i8 > %b to <2 x i32 >
5166 %shuffle = shufflevector <2 x i32 > %0 , <2 x i32 > undef , <2 x i32 > zeroinitializer
5267 %1 = bitcast <2 x i32 > %shuffle to <8 x i8 >
@@ -55,9 +70,11 @@ entry:
5570}
5671
; Indexed form with a 128-bit %b and a 64-bit result: %b is viewed as
; <4 x i32>, the all-zero shuffle mask picks element 0 for both elements of a
; <2 x i32>, and that is viewed as <8 x i8>. The check lines expect
; `usdot v0.2s, v1.8b, v2.4b[0]` immediately followed by `ret`, with no
; `// kill` annotation.
; The intrinsic call and the return are elided from this view.
5772define <2 x i32 > @usdot_lane.v2i32.v16i8 (<2 x i32 > %r , <8 x i8 > %a , <16 x i8 > %b ) {
73+ ; CHECK-LABEL: usdot_lane.v2i32.v16i8:
74+ ; CHECK: // %bb.0: // %entry
75+ ; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
76+ ; CHECK-NEXT: ret
5877entry:
59- ; CHECK-LABEL: usdot_lane.v2i32.v16i8
60- ; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
6178 %0 = bitcast <16 x i8 > %b to <4 x i32 >
6279 %shuffle = shufflevector <4 x i32 > %0 , <4 x i32 > undef , <2 x i32 > zeroinitializer
6380 %1 = bitcast <2 x i32 > %shuffle to <8 x i8 >
@@ -66,9 +83,11 @@ entry:
6683}
6784
; Indexed form with a 128-bit %b and a 64-bit result: %b is viewed as
; <4 x i32>, the all-zero shuffle mask picks element 0 for both elements of a
; <2 x i32>, and that is viewed as <8 x i8>. The check lines expect
; `sudot v0.2s, v1.8b, v2.4b[0]` immediately followed by `ret`, with no
; `// kill` annotation.
; The intrinsic call and the return are elided from this view, so the operand
; order passed to the intrinsic cannot be confirmed here.
6885define <2 x i32 > @sudot_lane.v2i32.v16i8 (<2 x i32 > %r , <8 x i8 > %a , <16 x i8 > %b ) {
86+ ; CHECK-LABEL: sudot_lane.v2i32.v16i8:
87+ ; CHECK: // %bb.0: // %entry
88+ ; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
89+ ; CHECK-NEXT: ret
6990entry:
70- ; CHECK-LABEL: sudot_lane.v2i32.v16i8
71- ; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
7291 %0 = bitcast <16 x i8 > %b to <4 x i32 >
7392 %shuffle = shufflevector <4 x i32 > %0 , <4 x i32 > undef , <2 x i32 > zeroinitializer
7493 %1 = bitcast <2 x i32 > %shuffle to <8 x i8 >
@@ -77,17 +96,22 @@ entry:
7796}
7897
; 128-bit vector form: passes %r, %a and %b straight to
; @llvm.aarch64.neon.usdot.v4i32.v16i8 and returns the result. The check lines
; expect `usdot v0.4s, v1.16b, v2.16b` immediately followed by `ret`.
; NOTE(review): the call site references attribute group #3, whose definition
; is not visible in this view.
7998define <4 x i32 > @usdot.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) {
99+ ; CHECK-LABEL: usdot.v4i32.v16i8:
100+ ; CHECK: // %bb.0: // %entry
101+ ; CHECK-NEXT: usdot v0.4s, v1.16b, v2.16b
102+ ; CHECK-NEXT: ret
80103entry:
81- ; CHECK-LABEL: usdot.v4i32.v16i8
82- ; CHECK: usdot v0.4s, v1.16b, v2.16b
83104 %vusdot1.i = tail call <4 x i32 > @llvm.aarch64.neon.usdot.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) #3
84105 ret <4 x i32 > %vusdot1.i
85106}
86107
; Indexed form with a 64-bit %b and a 128-bit result: %b is viewed as
; <2 x i32>, the all-zero shuffle mask replicates element 0 into all four
; elements of a <4 x i32>, and that is viewed as <16 x i8>. The check lines
; expect `usdot v0.4s, v1.16b, v2.4b[0]`, preceded by a `// kill` annotation
; that redefines $d2 as $q2, and followed by `ret`.
; The intrinsic call and the return are elided from this view.
87108define <4 x i32 > @usdot_lane.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <8 x i8 > %b ) {
109+ ; CHECK-LABEL: usdot_lane.v4i32.v16i8:
110+ ; CHECK: // %bb.0: // %entry
111+ ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
112+ ; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
113+ ; CHECK-NEXT: ret
88114entry:
89- ; CHECK-LABEL: usdot_lane.v4i32.v16i8
90- ; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
91115 %0 = bitcast <8 x i8 > %b to <2 x i32 >
92116 %shuffle = shufflevector <2 x i32 > %0 , <2 x i32 > undef , <4 x i32 > zeroinitializer
93117 %1 = bitcast <4 x i32 > %shuffle to <16 x i8 >
@@ -96,9 +120,12 @@ entry:
96120}
97121
; Indexed form with a 64-bit %b and a 128-bit result: %b is viewed as
; <2 x i32>, the all-zero shuffle mask replicates element 0 into all four
; elements of a <4 x i32>, and that is viewed as <16 x i8>. The check lines
; expect `sudot v0.4s, v1.16b, v2.4b[0]`, preceded by a `// kill` annotation
; that redefines $d2 as $q2, and followed by `ret`.
; The intrinsic call and the return are elided from this view, so the operand
; order passed to the intrinsic cannot be confirmed here.
98122define <4 x i32 > @sudot_lane.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <8 x i8 > %b ) {
123+ ; CHECK-LABEL: sudot_lane.v4i32.v16i8:
124+ ; CHECK: // %bb.0: // %entry
125+ ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
126+ ; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
127+ ; CHECK-NEXT: ret
99128entry:
100- ; CHECK-LABEL: sudot_lane.v4i32.v16i8
101- ; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
102129 %0 = bitcast <8 x i8 > %b to <2 x i32 >
103130 %shuffle = shufflevector <2 x i32 > %0 , <2 x i32 > undef , <4 x i32 > zeroinitializer
104131 %1 = bitcast <4 x i32 > %shuffle to <16 x i8 >
@@ -107,9 +134,11 @@ entry:
107134}
108135
; Indexed form with a 128-bit %b and a 128-bit result: %b is viewed as
; <4 x i32>, the all-zero shuffle mask replicates element 0 into all four
; elements, and the result is viewed as <16 x i8> again. The check lines expect
; `usdot v0.4s, v1.16b, v2.4b[0]` immediately followed by `ret`, with no
; `// kill` annotation.
; The intrinsic call and the return are elided from this view.
109136define <4 x i32 > @usdot_laneq.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) {
137+ ; CHECK-LABEL: usdot_laneq.v4i32.v16i8:
138+ ; CHECK: // %bb.0: // %entry
139+ ; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
140+ ; CHECK-NEXT: ret
110141entry:
111- ; CHECK-LABEL: usdot_laneq.v4i32.v16i8
112- ; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
113142 %0 = bitcast <16 x i8 > %b to <4 x i32 >
114143 %shuffle = shufflevector <4 x i32 > %0 , <4 x i32 > undef , <4 x i32 > zeroinitializer
115144 %1 = bitcast <4 x i32 > %shuffle to <16 x i8 >
@@ -118,9 +147,11 @@ entry:
118147}
119148
; Indexed form with a 128-bit %b and a 128-bit result: %b is viewed as
; <4 x i32>, the all-zero shuffle mask replicates element 0 into all four
; elements, and the result is viewed as <16 x i8> again. The check lines expect
; `sudot v0.4s, v1.16b, v2.4b[0]` immediately followed by `ret`, with no
; `// kill` annotation.
; The intrinsic call, the return and the closing brace are elided from this
; view, so the operand order passed to the intrinsic cannot be confirmed here.
120149define <4 x i32 > @sudot_laneq.v4i32.v16i8 (<4 x i32 > %r , <16 x i8 > %a , <16 x i8 > %b ) {
150+ ; CHECK-LABEL: sudot_laneq.v4i32.v16i8:
151+ ; CHECK: // %bb.0: // %entry
152+ ; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
153+ ; CHECK-NEXT: ret
121154entry:
122- ; CHECK-LABEL: sudot_laneq.v4i32.v16i8
123- ; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
124155 %0 = bitcast <16 x i8 > %b to <4 x i32 >
125156 %shuffle = shufflevector <4 x i32 > %0 , <4 x i32 > undef , <4 x i32 > zeroinitializer
126157 %1 = bitcast <4 x i32 > %shuffle to <16 x i8 >
@@ -133,4 +164,3 @@ declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16
; Declarations of the usmmla and usdot intrinsics called by the tests in this
; file; each takes an i32 accumulator vector plus two i8 vectors and returns a
; vector of the accumulator's type.
; NOTE(review): all three reference attribute group #2, whose definition is
; not visible in this view.
133164declare <4 x i32 > @llvm.aarch64.neon.usmmla.v4i32.v16i8 (<4 x i32 >, <16 x i8 >, <16 x i8 >) #2
134165declare <2 x i32 > @llvm.aarch64.neon.usdot.v2i32.v8i8 (<2 x i32 >, <8 x i8 >, <8 x i8 >) #2
135166declare <4 x i32 > @llvm.aarch64.neon.usdot.v4i32.v16i8 (<4 x i32 >, <16 x i8 >, <16 x i8 >) #2
136-
0 commit comments