Skip to content

Commit cb70a27

Browse files
committed
[AArch64][GlobalISel] Added usdot intrinsic support
GlobalISel now selects the usdot intrinsic directly, without falling back to SelectionDAG (SDAG).
1 parent 45495b5 commit cb70a27

File tree

3 files changed

+67
-28
lines changed

3 files changed

+67
-28
lines changed

llvm/lib/Target/AArch64/AArch64InstrGISel.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction {
233233
let hasSideEffects = 0;
234234
}
235235

236+
def G_USDOT : AArch64GenericInstruction {
237+
let OutOperandList = (outs type0:$dst);
238+
let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
239+
let hasSideEffects = 0;
240+
}
241+
236242
// Generic instruction for the BSP pseudo. It is expanded into BSP, which
237243
// expands into BSL/BIT/BIF after register allocation.
238244
def G_BSP : AArch64GenericInstruction {
@@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
278284

279285
def : GINodeEquiv<G_UDOT, AArch64udot>;
280286
def : GINodeEquiv<G_SDOT, AArch64sdot>;
287+
def : GINodeEquiv<G_USDOT, AArch64usdot>;
281288

282289
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
283290

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
18551855
return LowerTriOp(AArch64::G_UDOT);
18561856
case Intrinsic::aarch64_neon_sdot:
18571857
return LowerTriOp(AArch64::G_SDOT);
1858+
case Intrinsic::aarch64_neon_usdot:
1859+
return LowerTriOp(AArch64::G_USDOT);
18581860
case Intrinsic::aarch64_neon_sqxtn:
18591861
return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S);
18601862
case Intrinsic::aarch64_neon_sqxtun:

llvm/test/CodeGen/AArch64/aarch64-matmul.ll

Lines changed: 58 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,53 @@
1-
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
1+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s
2+
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
23

34
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
5+
; CHECK-LABEL: smmla.v4i32.v16i8:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: smmla v0.4s, v1.16b, v2.16b
8+
; CHECK-NEXT: ret
49
entry:
5-
; CHECK-LABEL: smmla.v4i32.v16i8
6-
; CHECK: smmla v0.4s, v1.16b, v2.16b
710
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
811
ret <4 x i32> %vmmla1.i
912
}
1013

1114
define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
15+
; CHECK-LABEL: ummla.v4i32.v16i8:
16+
; CHECK: // %bb.0: // %entry
17+
; CHECK-NEXT: ummla v0.4s, v1.16b, v2.16b
18+
; CHECK-NEXT: ret
1219
entry:
13-
; CHECK-LABEL: ummla.v4i32.v16i8
14-
; CHECK: ummla v0.4s, v1.16b, v2.16b
1520
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
1621
ret <4 x i32> %vmmla1.i
1722
}
1823

1924
define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
25+
; CHECK-LABEL: usmmla.v4i32.v16i8:
26+
; CHECK: // %bb.0: // %entry
27+
; CHECK-NEXT: usmmla v0.4s, v1.16b, v2.16b
28+
; CHECK-NEXT: ret
2029
entry:
21-
; CHECK-LABEL: usmmla.v4i32.v16i8
22-
; CHECK: usmmla v0.4s, v1.16b, v2.16b
2330
%vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
2431
ret <4 x i32> %vusmmla1.i
2532
}
2633

2734
define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
35+
; CHECK-LABEL: usdot.v2i32.v8i8:
36+
; CHECK: // %bb.0: // %entry
37+
; CHECK-NEXT: usdot v0.2s, v1.8b, v2.8b
38+
; CHECK-NEXT: ret
2839
entry:
29-
; CHECK-LABEL: usdot.v2i32.v8i8
30-
; CHECK: usdot v0.2s, v1.8b, v2.8b
3140
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
3241
ret <2 x i32> %vusdot1.i
3342
}
3443

3544
define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
45+
; CHECK-LABEL: usdot_lane.v2i32.v8i8:
46+
; CHECK: // %bb.0: // %entry
47+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
48+
; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
49+
; CHECK-NEXT: ret
3650
entry:
37-
; CHECK-LABEL: usdot_lane.v2i32.v8i8
38-
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
3951
%0 = bitcast <8 x i8> %b to <2 x i32>
4052
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
4153
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -44,9 +56,12 @@ entry:
4456
}
4557

4658
define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
59+
; CHECK-LABEL: sudot_lane.v2i32.v8i8:
60+
; CHECK: // %bb.0: // %entry
61+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
62+
; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
63+
; CHECK-NEXT: ret
4764
entry:
48-
; CHECK-LABEL: sudot_lane.v2i32.v8i8
49-
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
5065
%0 = bitcast <8 x i8> %b to <2 x i32>
5166
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
5267
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -55,9 +70,11 @@ entry:
5570
}
5671

5772
define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
73+
; CHECK-LABEL: usdot_lane.v2i32.v16i8:
74+
; CHECK: // %bb.0: // %entry
75+
; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
76+
; CHECK-NEXT: ret
5877
entry:
59-
; CHECK-LABEL: usdot_lane.v2i32.v16i8
60-
; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
6178
%0 = bitcast <16 x i8> %b to <4 x i32>
6279
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
6380
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -66,9 +83,11 @@ entry:
6683
}
6784

6885
define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
86+
; CHECK-LABEL: sudot_lane.v2i32.v16i8:
87+
; CHECK: // %bb.0: // %entry
88+
; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
89+
; CHECK-NEXT: ret
6990
entry:
70-
; CHECK-LABEL: sudot_lane.v2i32.v16i8
71-
; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
7291
%0 = bitcast <16 x i8> %b to <4 x i32>
7392
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
7493
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -77,17 +96,22 @@ entry:
7796
}
7897

7998
define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
99+
; CHECK-LABEL: usdot.v4i32.v16i8:
100+
; CHECK: // %bb.0: // %entry
101+
; CHECK-NEXT: usdot v0.4s, v1.16b, v2.16b
102+
; CHECK-NEXT: ret
80103
entry:
81-
; CHECK-LABEL: usdot.v4i32.v16i8
82-
; CHECK: usdot v0.4s, v1.16b, v2.16b
83104
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
84105
ret <4 x i32> %vusdot1.i
85106
}
86107

87108
define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
109+
; CHECK-LABEL: usdot_lane.v4i32.v16i8:
110+
; CHECK: // %bb.0: // %entry
111+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
112+
; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
113+
; CHECK-NEXT: ret
88114
entry:
89-
; CHECK-LABEL: usdot_lane.v4i32.v16i8
90-
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
91115
%0 = bitcast <8 x i8> %b to <2 x i32>
92116
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
93117
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -96,9 +120,12 @@ entry:
96120
}
97121

98122
define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
123+
; CHECK-LABEL: sudot_lane.v4i32.v16i8:
124+
; CHECK: // %bb.0: // %entry
125+
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
126+
; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
127+
; CHECK-NEXT: ret
99128
entry:
100-
; CHECK-LABEL: sudot_lane.v4i32.v16i8
101-
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
102129
%0 = bitcast <8 x i8> %b to <2 x i32>
103130
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
104131
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -107,9 +134,11 @@ entry:
107134
}
108135

109136
define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
137+
; CHECK-LABEL: usdot_laneq.v4i32.v16i8:
138+
; CHECK: // %bb.0: // %entry
139+
; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
140+
; CHECK-NEXT: ret
110141
entry:
111-
; CHECK-LABEL: usdot_laneq.v4i32.v16i8
112-
; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
113142
%0 = bitcast <16 x i8> %b to <4 x i32>
114143
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
115144
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -118,9 +147,11 @@ entry:
118147
}
119148

120149
define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
150+
; CHECK-LABEL: sudot_laneq.v4i32.v16i8:
151+
; CHECK: // %bb.0: // %entry
152+
; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
153+
; CHECK-NEXT: ret
121154
entry:
122-
; CHECK-LABEL: sudot_laneq.v4i32.v16i8
123-
; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
124155
%0 = bitcast <16 x i8> %b to <4 x i32>
125156
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
126157
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -133,4 +164,3 @@ declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16
133164
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
134165
declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
135166
declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
136-

0 commit comments

Comments
 (0)