1111
1212#ifdef cl_khr_fp16
1313#pragma OPENCL EXTENSION cl_khr_fp16 : enable
14- struct out_16 {
15- short x , y , z , w ;
16- };
1714#endif
1815
1916#ifdef cl_khr_3d_image_writes
2017#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
2118#endif
2219
23- struct out_32 {
24- int x , y , z , w ;
25- };
26-
2720// CLC helpers
2821int __clc__sampler_extract_normalized_coords_prop (int ) __asm(
2922 "__clc__sampler_extract_normalized_coords_prop" );
@@ -57,73 +50,79 @@ int __clc__sampled_image3d_unpack_sampler(__ocl_sampled_image3d_ro_t) __asm(
5750 "__clc__sampled_image_unpack_sampler" );
5851
5952// NVVM helpers
60- struct out_16
61- __nvvm_suld_1d_v4i16_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_trap_s" );
62- struct out_16
53+ #ifdef cl_khr_fp16
54+ short4
55+ __nvvm_suld_1d_v4i16_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_trap" );
56+ short4
6357__nvvm_suld_2d_v4i16_trap_s (long , int ,
6458 int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_trap" );
65- struct out_16
59+ short4
6660__nvvm_suld_3d_v4i16_trap_s (long , int , int ,
6761 int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_trap" );
68- struct out_32
69- __nvvm_suld_1d_v4i32_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_trap" );
70- struct out_32
71- __nvvm_suld_2d_v4i32_trap_s (long , int ,
72- int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_trap" );
73- struct out_32
74- __nvvm_suld_3d_v4i32_trap_s (long , int , int ,
75- int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_trap" );
7662
77- struct out_16
63+ short4
7864__nvvm_suld_1d_v4i16_clamp_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_clamp" );
79- struct out_16
65+ short4
8066__nvvm_suld_2d_v4i16_clamp_s (long , int ,
8167 int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_clamp" );
82- struct out_16
68+ short4
8369__nvvm_suld_3d_v4i16_clamp_s (long , int , int ,
8470 int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_clamp" );
85- struct out_32
86- __nvvm_suld_1d_v4i32_clamp_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_clamp" );
87- struct out_32
88- __nvvm_suld_2d_v4i32_clamp_s (long , int ,
89- int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_clamp" );
90- struct out_32
91- __nvvm_suld_3d_v4i32_clamp_s (long , int , int ,
92- int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_clamp" );
9371
94- struct out_16
72+ short4
9573__nvvm_suld_1d_v4i16_zero_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_zero" );
96- struct out_16
74+ short4
9775__nvvm_suld_2d_v4i16_zero_s (long , int ,
9876 int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_zero" );
99- struct out_16
77+ short4
10078__nvvm_suld_3d_v4i16_zero_s (long , int , int ,
10179 int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_zero" );
102- struct out_32
103- __nvvm_suld_1d_v4i32_zero_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_zero" );
104- struct out_32
105- __nvvm_suld_2d_v4i32_zero_s (long , int ,
106- int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_zero" );
107- struct out_32
108- __nvvm_suld_3d_v4i32_zero_s (long , int , int ,
109- int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_zero" );
11080
111- struct out_16
81+ short4
11282__nvvm_suld_1d_v4i16_clamp (read_only image1d_t ,
11383 int ) __asm("__clc_llvm_nvvm_suld_1d_v4i16_clamp" );
114- struct out_16
84+ short4
11585__nvvm_suld_2d_v4i16_clamp (read_only image2d_t , int ,
11686 int ) __asm("__clc_llvm_nvvm_suld_2d_v4i16_clamp" );
117- struct out_16
87+ short4
11888__nvvm_suld_3d_v4i16_clamp (read_only image3d_t , int , int ,
11989 int ) __asm("__clc_llvm_nvvm_suld_3d_v4i16_clamp" );
120- struct out_32
90+ #endif
91+
92+ int4
93+ __nvvm_suld_1d_v4i32_trap_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_trap" );
94+ int4
95+ __nvvm_suld_2d_v4i32_trap_s (long , int ,
96+ int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_trap" );
97+ int4
98+ __nvvm_suld_3d_v4i32_trap_s (long , int , int ,
99+ int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_trap" );
100+
101+ int4
102+ __nvvm_suld_1d_v4i32_clamp_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_clamp" );
103+ int4
104+ __nvvm_suld_2d_v4i32_clamp_s (long , int ,
105+ int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_clamp" );
106+ int4
107+ __nvvm_suld_3d_v4i32_clamp_s (long , int , int ,
108+ int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_clamp" );
109+
110+ int4
111+ __nvvm_suld_1d_v4i32_zero_s (long , int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_zero" );
112+ int4
113+ __nvvm_suld_2d_v4i32_zero_s (long , int ,
114+ int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_zero" );
115+ int4
116+ __nvvm_suld_3d_v4i32_zero_s (long , int , int ,
117+ int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_zero" );
118+
119+ int4
121120__nvvm_suld_1d_v4i32_clamp (read_only image1d_t ,
122121 int ) __asm("__clc_llvm_nvvm_suld_1d_v4i32_clamp" );
123- struct out_32
122+ int4
124123__nvvm_suld_2d_v4i32_clamp (read_only image2d_t , int ,
125124 int ) __asm("__clc_llvm_nvvm_suld_2d_v4i32_clamp" );
126- struct out_32
125+ int4
127126__nvvm_suld_3d_v4i32_clamp (read_only image3d_t , int , int ,
128127 int ) __asm("__clc_llvm_nvvm_suld_3d_v4i32_clamp" );
129128
@@ -199,11 +198,8 @@ typedef float4 pixelf32;
199198typedef half fp16 ;
200199typedef float fp32 ;
201200
202- #define _DEFINE_OUT_TYPE (elem_t , elem_size ) \
203- inline elem_t##4 out_##elem_t(struct out_##elem_size out) { \
204- return (elem_t##4)(as_##elem_t(out.x), as_##elem_t(out.y), \
205- as_##elem_t(out.z), as_##elem_t(out.w)); \
206- }
201+ pixelf16 as_pixelf16 (short4 v ) { return as_half4 (v ); } // reinterpret the four 16-bit lanes as half4 (bit cast, no numeric conversion)
202+ pixelf32 as_pixelf32 (int4 v ) { return as_float4 (v ); } // reinterpret the four 32-bit lanes as float4 (bit cast, no numeric conversion)
207203
208204#define _DEFINE_VEC4_CAST (from_t , to_t ) \
209205 inline to_t##4 cast_##from_t##4_to_##to_t##4(from_t##4 from) { \
@@ -223,44 +219,30 @@ typedef float fp32;
223219 return cast_##pixelf_base_t##_to_##to_t(from); \
224220 }
225221
226- #define _DEFINE_OUT_PIXELF (pixelf_size , elem_t ) \
227- inline pixelf##pixelf_size out_pixelf##pixelf_size( \
228- struct out_##pixelf_size out) { \
229- return (pixelf##pixelf_size)(as_##elem_t(out.x), as_##elem_t(out.y), \
230- as_##elem_t(out.z), as_##elem_t(out.w)); \
231- }
232-
233222#define _DEFINE_READ_1D_PIXELF (pixelf_size , cuda_address_mode ) \
234223 pixelf##pixelf_size read_1d_##pixelf_size##_##cuda_address_mode(long image, \
235224 int x) { \
236- struct out_ ##pixelf_size res = \
225+ return as_pixelf ##pixelf_size( \
237226 __nvvm_suld_1d_v4i##pixelf_size##_##cuda_address_mode##_s( \
238- image, x * sizeof(struct out_##pixelf_size)); \
239- return out_pixelf##pixelf_size(res); \
227+ image, x * sizeof(pixelf##pixelf_size))); \
240228 }
241229
242230#define _DEFINE_READ_2D_PIXELF (pixelf_size , cuda_address_mode ) \
243231 pixelf##pixelf_size read_2d_##pixelf_size##_##cuda_address_mode( \
244232 long image, int x, int y) { \
245- struct out_ ##pixelf_size res = \
233+ return as_pixelf ##pixelf_size( \
246234 __nvvm_suld_2d_v4i##pixelf_size##_##cuda_address_mode##_s( \
247- image, x * sizeof(struct out_##pixelf_size), y); \
248- return out_pixelf##pixelf_size(res); \
235+ image, x * sizeof(pixelf##pixelf_size), y)); \
249236 }
250237
251238#define _DEFINE_READ_3D_PIXELF (pixelf_size , cuda_address_mode ) \
252239 pixelf##pixelf_size read_3d_##pixelf_size##_##cuda_address_mode( \
253240 long image, int x, int y, int z) { \
254- struct out_ ##pixelf_size res = \
241+ return as_pixelf ##pixelf_size( \
255242 __nvvm_suld_3d_v4i##pixelf_size##_##cuda_address_mode##_s( \
256- image, x * sizeof(struct out_##pixelf_size), y, z); \
257- return out_pixelf##pixelf_size(res); \
243+ image, x * sizeof(pixelf##pixelf_size), y, z)); \
258244 }
259245
260- _DEFINE_OUT_TYPE (float , 32 )
261- _DEFINE_OUT_TYPE (int , 32 )
262- _DEFINE_OUT_TYPE (uint , 32 )
263-
264246_DEFINE_VEC4_CAST (float , int )
265247_DEFINE_VEC4_CAST (int , float )
266248_DEFINE_VEC4_CAST (float , uint )
@@ -276,8 +258,6 @@ _DEFINE_CAST(pixelf32, float4)
276258_DEFINE_CAST (pixelf32 , pixelf32 )
277259_DEFINE_CAST (float4 , pixelf32 )
278260
279- _DEFINE_OUT_PIXELF (32 , float )
280-
281261_DEFINE_PIXELF_CAST (32 , float4 , int4 )
282262_DEFINE_PIXELF_CAST (32 , float4 , uint4 )
283263
@@ -298,8 +278,6 @@ _DEFINE_CAST(half4, half4)
298278_DEFINE_CAST (pixelf16 , half4 )
299279_DEFINE_CAST (pixelf16 , pixelf16 )
300280_DEFINE_CAST (half4 , pixelf16 )
301- _DEFINE_OUT_TYPE (half , 16 )
302- _DEFINE_OUT_PIXELF (16 , half )
303281_DEFINE_READ_1D_PIXELF (16 , trap )
304282_DEFINE_READ_2D_PIXELF (16 , trap )
305283_DEFINE_READ_3D_PIXELF (16 , trap )
@@ -311,11 +289,9 @@ _DEFINE_READ_2D_PIXELF(16, clamp)
311289_DEFINE_READ_3D_PIXELF (16 , clamp )
312290#endif
313291
314- #undef _DEFINE_OUT_TYPE
315292#undef _DEFINE_VEC4_CAST
316293#undef _DEFINE_VEC2_CAST
317294#undef _DEFINE_CAST
318- #undef _DEFINE_OUT_PIXELF
319295#undef _DEFINE_READ_1D_PIXELF
320296#undef _DEFINE_READ_2D_PIXELF
321297#undef _DEFINE_READ_3D_PIXELF
@@ -327,15 +303,15 @@ _DEFINE_READ_3D_PIXELF(16, clamp)
327303 _CLC_DEF \
328304 elem_t##4 _Z17__spirv_ImageReadIDv4_##elem_t_mangled##14ocl_image1d_roiET_T0_T1_( \
329305 read_only image1d_t image, int x) { \
330- return out_ ##elem_t( \
306+ return as_ ##elem_t##4( \
331307 __nvvm_suld_1d_v4i##elem_size##_clamp(image, x * sizeof(elem_t##4))); \
332308 }
333309
334310#define _CLC_DEFINE_IMAGE2D_READ_BUILTIN (elem_t , elem_t_mangled , elem_size ) \
335311 _CLC_DEF \
336312 elem_t##4 _Z17__spirv_ImageReadIDv4_##elem_t_mangled##14ocl_image2d_roDv2_iET_T0_T1_( \
337313 read_only image2d_t image, int2 coord) { \
338- return out_ ##elem_t(__nvvm_suld_2d_v4i##elem_size##_clamp( \
314+ return as_ ##elem_t##4 (__nvvm_suld_2d_v4i##elem_size##_clamp( \
339315 image, coord.x * sizeof(elem_t##4), coord.y)); \
340316 }
341317
@@ -344,7 +320,7 @@ _DEFINE_READ_3D_PIXELF(16, clamp)
344320 _CLC_DEF \
345321 elem_t##4 _Z17__spirv_ImageReadIDv4_##elem_t_mangled##14ocl_image3d_ro##coord_mangled##ET_T0_T1_( \
346322 read_only image3d_t image, int4 coord) { \
347- return out_ ##elem_t(__nvvm_suld_3d_v4i##elem_size##_clamp( \
323+ return as_ ##elem_t##4 (__nvvm_suld_3d_v4i##elem_size##_clamp( \
348324 image, coord.x * sizeof(elem_t##4), coord.y, coord.z)); \
349325 }
350326
@@ -463,7 +439,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
463439 float coord, long image, int sampler) { \
464440 if (is_nearest_filter_mode(sampler)) { \
465441 int i = (int)__spirv_ocl_floor(coord); \
466- return out_ ##elem_t( \
442+ return as_ ##elem_t##4( \
467443 __nvvm_suld_1d_v4i##elem_size##_##cuda_address_mode##_s( \
468444 image, i * sizeof(elem_t##4))); \
469445 } else { \
@@ -487,7 +463,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
487463 if (is_nearest_filter_mode(sampler)) { \
488464 int i = (int)__spirv_ocl_floor(coord.x); \
489465 int j = (int)__spirv_ocl_floor(coord.y); \
490- return out_ ##elem_t( \
466+ return as_ ##elem_t##4( \
491467 __nvvm_suld_2d_v4i##elem_size##_##cuda_address_mode##_s( \
492468 image, i * sizeof(elem_t##4), j)); \
493469 } else { \
@@ -520,7 +496,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
520496 int i = (int)__spirv_ocl_floor(coord.x); \
521497 int j = (int)__spirv_ocl_floor(coord.y); \
522498 int k = (int)__spirv_ocl_floor(coord.z); \
523- return out_ ##elem_t( \
499+ return as_ ##elem_t##4( \
524500 __nvvm_suld_3d_v4i##elem_size##_##cuda_address_mode##_s( \
525501 image, i * sizeof(elem_t##4), j, k)); \
526502 } else { \
@@ -570,7 +546,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
570546 if (i > width - 1) { \
571547 i = i - width; \
572548 } \
573- return out_ ##elem_t(__nvvm_suld_1d_v4i##elem_size##_trap_s( \
549+ return as_ ##elem_t##4 (__nvvm_suld_1d_v4i##elem_size##_trap_s( \
574550 image, i * sizeof(elem_t##4))); \
575551 } else { \
576552 int i0, i1; \
@@ -609,7 +585,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
609585 if (j > height - 1) { \
610586 j = j - height; \
611587 } \
612- return out_ ##elem_t(__nvvm_suld_2d_v4i##elem_size##_trap_s( \
588+ return as_ ##elem_t##4 (__nvvm_suld_2d_v4i##elem_size##_trap_s( \
613589 image, i * sizeof(elem_t##4), j)); \
614590 } else { \
615591 int i0, i1, j0, j1; \
@@ -666,7 +642,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
666642 if (k > depth - 1) { \
667643 k = k - depth; \
668644 } \
669- return out_ ##elem_t(__nvvm_suld_3d_v4i##elem_size##_trap_s( \
645+ return as_ ##elem_t##4 (__nvvm_suld_3d_v4i##elem_size##_trap_s( \
670646 image, i * sizeof(elem_t##4), j, k)); \
671647 } else { \
672648 int i0, i1, j0, j1, k0, k1; \
@@ -735,7 +711,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
735711 int i = (int)__spirv_ocl_floor(u); \
736712 i = __spirv_ocl_s_min(i, width - 1); \
737713 \
738- return out_ ##elem_t(__nvvm_suld_1d_v4i##elem_size##_trap_s( \
714+ return as_ ##elem_t##4 (__nvvm_suld_1d_v4i##elem_size##_trap_s( \
739715 image, i * sizeof(elem_t##4))); \
740716 } else { \
741717 int i0, i1; \
@@ -771,7 +747,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
771747 int j = (int)__spirv_ocl_floor(v); \
772748 j = __spirv_ocl_s_min(j, height - 1); \
773749 \
774- return out_ ##elem_t(__nvvm_suld_2d_v4i##elem_size##_trap_s( \
750+ return as_ ##elem_t##4 (__nvvm_suld_2d_v4i##elem_size##_trap_s( \
775751 image, i * sizeof(elem_t##4), j)); \
776752 } else { \
777753 int i0, i1, j0, j1; \
@@ -821,7 +797,7 @@ float4 unnormalized_coord_3d(float4 coord, long image) {
821797 int k = (int)__spirv_ocl_floor(w); \
822798 k = __spirv_ocl_s_min(k, depth - 1); \
823799 \
824- return out_ ##elem_t(__nvvm_suld_3d_v4i##elem_size##_trap_s( \
800+ return as_ ##elem_t##4 (__nvvm_suld_3d_v4i##elem_size##_trap_s( \
825801 image, i * sizeof(elem_t##4), j, k)); \
826802 } else { \
827803 int i0, i1, j0, j1, k0, k1; \
@@ -913,8 +889,7 @@ _DEFINE_SAMPLED_LOADS(half, 16)
913889 /* Sampling algorithms are implemented assuming an \
914890 * unnormalized floating point coordinate as input. Need to transform as \
915891 * appropriate. */ \
916- sampling_coord_t sampling_coord = \
917- cast_ ##input_coord_t ##_to_##sampling_coord_t(input_coord); \
892+ sampling_coord_t sampling_coord = as_ ##sampling_coord_t (input_coord); \
918893 if (is_normalized_coords(sampler)) { \
919894 sampling_coord = unnormalized_coord_##dims##d(sampling_coord, image); \
920895 } \
0 commit comments