1- ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,IR,IRC
2- ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,IR,IRO
3- ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,PTX,PTXC
4- ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes COMMON,PTX,PTXO
1+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+ ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC
3+ ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO
4+ ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC
5+ ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes PTX,PTXO
56; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
67
78target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
@@ -12,12 +13,60 @@ target triple = "nvptx64-nvidia-cuda"
1213%class.padded = type { i8 , i32 }
1314
1415; Check that nvptx-lower-args preserves arg alignment
15- ; COMMON-LABEL: load_alignment
1616define void @load_alignment (ptr nocapture readonly byval (%class.outer ) align 8 %arg ) {
17+ ; IR-LABEL: define void @load_alignment(
18+ ; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) {
19+ ; IR-NEXT: [[ENTRY:.*:]]
20+ ; IR-NEXT: [[ARG1:%.*]] = alloca [[CLASS_OUTER]], align 8
21+ ; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(101)
22+ ; IR-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[ARG1]], ptr addrspace(101) align 8 [[ARG2]], i64 24, i1 false)
23+ ; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG1]], align 8
24+ ; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 0, i32 1
25+ ; IR-NEXT: [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8
26+ ; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 1
27+ ; IR-NEXT: [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8
28+ ; IR-NEXT: [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4
29+ ; IR-NEXT: [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]]
30+ ; IR-NEXT: store i32 [[ADD_I]], ptr [[ARG_IDX1_VAL]], align 4
31+ ; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull [[ARG_IDX2]])
32+ ; IR-NEXT: ret void
33+ ;
34+ ; PTX-LABEL: load_alignment(
35+ ; PTX: {
36+ ; PTX-NEXT: .local .align 8 .b8 __local_depot0[24];
37+ ; PTX-NEXT: .reg .b64 %SP;
38+ ; PTX-NEXT: .reg .b64 %SPL;
39+ ; PTX-NEXT: .reg .b32 %r<4>;
40+ ; PTX-NEXT: .reg .b64 %rd<10>;
41+ ; PTX-EMPTY:
42+ ; PTX-NEXT: // %bb.0: // %entry
43+ ; PTX-NEXT: mov.u64 %SPL, __local_depot0;
44+ ; PTX-NEXT: add.u64 %rd2, %SPL, 0;
45+ ; PTX-NEXT: ld.param.u64 %rd3, [load_alignment_param_0+16];
46+ ; PTX-NEXT: st.local.u64 [%rd2+16], %rd3;
47+ ; PTX-NEXT: ld.param.u64 %rd4, [load_alignment_param_0+8];
48+ ; PTX-NEXT: st.local.u64 [%rd2+8], %rd4;
49+ ; PTX-NEXT: ld.param.u64 %rd5, [load_alignment_param_0];
50+ ; PTX-NEXT: st.local.u64 [%rd2], %rd5;
51+ ; PTX-NEXT: add.s64 %rd6, %rd2, 16;
52+ ; PTX-NEXT: cvta.local.u64 %rd7, %rd6;
53+ ; PTX-NEXT: cvt.u32.u64 %r1, %rd3;
54+ ; PTX-NEXT: ld.u32 %r2, [%rd5];
55+ ; PTX-NEXT: add.s32 %r3, %r2, %r1;
56+ ; PTX-NEXT: st.u32 [%rd4], %r3;
57+ ; PTX-NEXT: { // callseq 0, 0
58+ ; PTX-NEXT: .param .b64 param0;
59+ ; PTX-NEXT: st.param.b64 [param0], %rd7;
60+ ; PTX-NEXT: .param .b64 retval0;
61+ ; PTX-NEXT: call.uni (retval0),
62+ ; PTX-NEXT: escape,
63+ ; PTX-NEXT: (
64+ ; PTX-NEXT: param0
65+ ; PTX-NEXT: );
66+ ; PTX-NEXT: ld.param.b64 %rd8, [retval0];
67+ ; PTX-NEXT: } // callseq 0
68+ ; PTX-NEXT: ret;
1769entry:
18- ; IR: call void @llvm.memcpy.p0.p101.i64(ptr align 8
19- ; PTX: ld.param.u64
20- ; PTX-NOT: ld.param.u8
2170 %arg.idx.val = load ptr , ptr %arg , align 8
2271 %arg.idx1 = getelementptr %class.outer , ptr %arg , i64 0 , i32 0 , i32 1
2372 %arg.idx1.val = load ptr , ptr %arg.idx1 , align 8
@@ -34,8 +83,16 @@ entry:
3483}
3584
3685; Check that nvptx-lower-args copies padding, as the struct may have been a union
37- ; COMMON-LABEL: load_padding
3886define void @load_padding (ptr nocapture readonly byval (%class.padded ) %arg ) {
87+ ; IR-LABEL: define void @load_padding(
88+ ; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) captures(none) [[ARG:%.*]]) {
89+ ; IR-NEXT: [[ARG1:%.*]] = alloca [[CLASS_PADDED]], align 8
90+ ; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(101)
91+ ; IR-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[ARG1]], ptr addrspace(101) align 8 [[ARG2]], i64 8, i1 false)
92+ ; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG1]])
93+ ; IR-NEXT: ret void
94+ ;
95+ ; PTX-LABEL: load_padding(
3996; PTX: {
4097; PTX-NEXT: .local .align 8 .b8 __local_depot1[8];
4198; PTX-NEXT: .reg .b64 %SP;
@@ -45,8 +102,8 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
45102; PTX-NEXT: // %bb.0:
46103; PTX-NEXT: mov.u64 %SPL, __local_depot1;
47104; PTX-NEXT: cvta.local.u64 %SP, %SPL;
48- ; PTX-NEXT: add.u64 %rd1, %SP, 0;
49- ; PTX-NEXT: add.u64 %rd2, %SPL, 0;
105+ ; PTX-NEXT: add.u64 %rd1, %SP, 0;
106+ ; PTX-NEXT: add.u64 %rd2, %SPL, 0;
50107; PTX-NEXT: ld.param.u64 %rd3, [load_padding_param_0];
51108; PTX-NEXT: st.local.u64 [%rd2], %rd3;
52109; PTX-NEXT: { // callseq 1, 0
@@ -65,85 +122,168 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
65122 ret void
66123}
67124
68- ; COMMON-LABEL: ptr_generic
69- define ptx_kernel void @ptr_generic (ptr %out , ptr %in ) {
70- ; IRC: %in3 = addrspacecast ptr %in to ptr addrspace(1)
71- ; IRC: %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
72- ; IRC: %out1 = addrspacecast ptr %out to ptr addrspace(1)
73- ; IRC: %out2 = addrspacecast ptr addrspace(1) %out1 to ptr
74- ; PTXC: cvta.to.global.u64
75- ; PTXC: cvta.to.global.u64
76- ; PTXC: ld.global.u32
77- ; PTXC: st.global.u32
78-
79125; OpenCL can't make assumptions about incoming pointers, so we should generate
80126; generic-pointer loads/stores.
81- ; IRO-NOT: addrspacecast
82- ; PTXO-NOT: cvta.to.global
83- ; PTXO: ld.u32
84- ; PTXO: st.u32
127+ define ptx_kernel void @ptr_generic (ptr %out , ptr %in ) {
128+ ; IRC-LABEL: define ptx_kernel void @ptr_generic(
129+ ; IRC-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
130+ ; IRC-NEXT: [[IN3:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
131+ ; IRC-NEXT: [[IN4:%.*]] = addrspacecast ptr addrspace(1) [[IN3]] to ptr
132+ ; IRC-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
133+ ; IRC-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
134+ ; IRC-NEXT: [[V:%.*]] = load i32, ptr [[IN4]], align 4
135+ ; IRC-NEXT: store i32 [[V]], ptr [[OUT2]], align 4
136+ ; IRC-NEXT: ret void
137+ ;
138+ ; IRO-LABEL: define ptx_kernel void @ptr_generic(
139+ ; IRO-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
140+ ; IRO-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4
141+ ; IRO-NEXT: store i32 [[V]], ptr [[OUT]], align 4
142+ ; IRO-NEXT: ret void
143+ ;
144+ ; PTXC-LABEL: ptr_generic(
145+ ; PTXC: {
146+ ; PTXC-NEXT: .reg .b32 %r<2>;
147+ ; PTXC-NEXT: .reg .b64 %rd<5>;
148+ ; PTXC-EMPTY:
149+ ; PTXC-NEXT: // %bb.0:
150+ ; PTXC-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0];
151+ ; PTXC-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1];
152+ ; PTXC-NEXT: cvta.to.global.u64 %rd3, %rd2;
153+ ; PTXC-NEXT: cvta.to.global.u64 %rd4, %rd1;
154+ ; PTXC-NEXT: ld.global.u32 %r1, [%rd3];
155+ ; PTXC-NEXT: st.global.u32 [%rd4], %r1;
156+ ; PTXC-NEXT: ret;
157+ ;
158+ ; PTXO-LABEL: ptr_generic(
159+ ; PTXO: {
160+ ; PTXO-NEXT: .reg .b32 %r<2>;
161+ ; PTXO-NEXT: .reg .b64 %rd<3>;
162+ ; PTXO-EMPTY:
163+ ; PTXO-NEXT: // %bb.0:
164+ ; PTXO-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0];
165+ ; PTXO-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1];
166+ ; PTXO-NEXT: ld.u32 %r1, [%rd2];
167+ ; PTXO-NEXT: st.u32 [%rd1], %r1;
168+ ; PTXO-NEXT: ret;
85169 %v = load i32 , ptr %in , align 4
86170 store i32 %v , ptr %out , align 4
87171 ret void
88172}
89173
90- ; COMMON-LABEL: ptr_nongeneric
91174define ptx_kernel void @ptr_nongeneric (ptr addrspace (1 ) %out , ptr addrspace (3 ) %in ) {
92- ; IR-NOT: addrspacecast
93- ; PTX-NOT: cvta.to.global
94- ; PTX: ld.shared.u32
95- ; PTX st.global.u32
175+ ; IR-LABEL: define ptx_kernel void @ptr_nongeneric(
176+ ; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
177+ ; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4
178+ ; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
179+ ; IR-NEXT: ret void
180+ ;
181+ ; PTX-LABEL: ptr_nongeneric(
182+ ; PTX: {
183+ ; PTX-NEXT: .reg .b32 %r<2>;
184+ ; PTX-NEXT: .reg .b64 %rd<3>;
185+ ; PTX-EMPTY:
186+ ; PTX-NEXT: // %bb.0:
187+ ; PTX-NEXT: ld.param.u64 %rd1, [ptr_nongeneric_param_0];
188+ ; PTX-NEXT: ld.param.u64 %rd2, [ptr_nongeneric_param_1];
189+ ; PTX-NEXT: ld.shared.u32 %r1, [%rd2];
190+ ; PTX-NEXT: st.global.u32 [%rd1], %r1;
191+ ; PTX-NEXT: ret;
96192 %v = load i32 , ptr addrspace (3 ) %in , align 4
97193 store i32 %v , ptr addrspace (1 ) %out , align 4
98194 ret void
99195}
100196
101- ; COMMON-LABEL: ptr_as_int
102- define ptx_kernel void @ptr_as_int (i64 noundef %i , i32 noundef %v ) {
103- ; IR: [[P:%.*]] = inttoptr i64 %i to ptr
104- ; IRC: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
105- ; IRC: addrspacecast ptr addrspace(1) [[P1]] to ptr
106- ; IRO-NOT: addrspacecast
107-
108- ; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_param_0];
109- ; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
110- ; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
111- ; PTXC: st.global.u32 [%[[P]]], [[V]];
112-
113- ; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_param_0];
114- ; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
115- ; PTXO: st.u32 [%[[P]]], [[V]];
116-
197+ define ptx_kernel void @ptr_as_int (i64 noundef %i , i32 noundef %v ) {
198+ ; IRC-LABEL: define ptx_kernel void @ptr_as_int(
199+ ; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
200+ ; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
201+ ; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
202+ ; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
203+ ; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4
204+ ; IRC-NEXT: ret void
205+ ;
206+ ; IRO-LABEL: define ptx_kernel void @ptr_as_int(
207+ ; IRO-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
208+ ; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
209+ ; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
210+ ; IRO-NEXT: ret void
211+ ;
212+ ; PTXC-LABEL: ptr_as_int(
213+ ; PTXC: {
214+ ; PTXC-NEXT: .reg .b32 %r<2>;
215+ ; PTXC-NEXT: .reg .b64 %rd<3>;
216+ ; PTXC-EMPTY:
217+ ; PTXC-NEXT: // %bb.0:
218+ ; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0];
219+ ; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1];
220+ ; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1;
221+ ; PTXC-NEXT: st.global.u32 [%rd2], %r1;
222+ ; PTXC-NEXT: ret;
223+ ;
224+ ; PTXO-LABEL: ptr_as_int(
225+ ; PTXO: {
226+ ; PTXO-NEXT: .reg .b32 %r<2>;
227+ ; PTXO-NEXT: .reg .b64 %rd<2>;
228+ ; PTXO-EMPTY:
229+ ; PTXO-NEXT: // %bb.0:
230+ ; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0];
231+ ; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1];
232+ ; PTXO-NEXT: st.u32 [%rd1], %r1;
233+ ; PTXO-NEXT: ret;
117234 %p = inttoptr i64 %i to ptr
118235 store i32 %v , ptr %p , align 4
119236 ret void
120237}
121238
122239%struct.S = type { i64 }
123240
124- ; COMMON-LABEL: ptr_as_int_aggr
125241define ptx_kernel void @ptr_as_int_aggr (ptr nocapture noundef readonly byval (%struct.S ) align 8 %s , i32 noundef %v ) {
126- ; IR: [[S:%.*]] = addrspacecast ptr %s to ptr addrspace(101)
127- ; IR: [[I:%.*]] = load i64, ptr addrspace(101) [[S]], align 8
128- ; IR: [[P0:%.*]] = inttoptr i64 [[I]] to ptr
129- ; IRC: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
130- ; IRC: [[P:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
131- ; IRO-NOT: addrspacecast
132-
133- ; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_aggr_param_0];
134- ; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
135- ; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
136- ; PTXC: st.global.u32 [%[[P]]], [[V]];
137-
138- ; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_aggr_param_0];
139- ; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
140- ; PTXO: st.u32 [%[[P]]], [[V]];
242+ ; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
243+ ; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
244+ ; IRC-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
245+ ; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
246+ ; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
247+ ; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
248+ ; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
249+ ; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4
250+ ; IRC-NEXT: ret void
251+ ;
252+ ; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
253+ ; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
254+ ; IRO-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
255+ ; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
256+ ; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
257+ ; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
258+ ; IRO-NEXT: ret void
259+ ;
260+ ; PTXC-LABEL: ptr_as_int_aggr(
261+ ; PTXC: {
262+ ; PTXC-NEXT: .reg .b32 %r<2>;
263+ ; PTXC-NEXT: .reg .b64 %rd<3>;
264+ ; PTXC-EMPTY:
265+ ; PTXC-NEXT: // %bb.0:
266+ ; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1];
267+ ; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0];
268+ ; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1;
269+ ; PTXC-NEXT: st.global.u32 [%rd2], %r1;
270+ ; PTXC-NEXT: ret;
271+ ;
272+ ; PTXO-LABEL: ptr_as_int_aggr(
273+ ; PTXO: {
274+ ; PTXO-NEXT: .reg .b32 %r<2>;
275+ ; PTXO-NEXT: .reg .b64 %rd<2>;
276+ ; PTXO-EMPTY:
277+ ; PTXO-NEXT: // %bb.0:
278+ ; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1];
279+ ; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0];
280+ ; PTXO-NEXT: st.u32 [%rd1], %r1;
281+ ; PTXO-NEXT: ret;
141282 %i = load i64 , ptr %s , align 8
142283 %p = inttoptr i64 %i to ptr
143284 store i32 %v , ptr %p , align 4
144285 ret void
145286}
146287
147-
148288; Function Attrs: convergent nounwind
149289declare dso_local ptr @escape (ptr ) local_unnamed_addr
0 commit comments