From 12ad80fe2affbd2df59a2794c92e98d25454c254 Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Fri, 31 May 2019 08:44:10 +0300
Subject: [PATCH] [SYCL] Remove _Float16 from integration header

The main problem that half type is defined as _Float16 on the device and
as manually implemented type on host so compiler could add _Float16
to the integration header and it will produce errors in host
compilation.

Signed-off-by: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
---
 sycl/include/CL/sycl/detail/kernel_desc.hpp | 14 ++++++++++++++
 sycl/test/sub_group/load_store.cpp          | 13 +++----------
 sycl/test/sub_group/shuffle.cpp             |  8 ++------
 3 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/sycl/include/CL/sycl/detail/kernel_desc.hpp b/sycl/include/CL/sycl/detail/kernel_desc.hpp
index c887fea8d8446..f5cecb8dfaa2a 100644
--- a/sycl/include/CL/sycl/detail/kernel_desc.hpp
+++ b/sycl/include/CL/sycl/detail/kernel_desc.hpp
@@ -15,6 +15,20 @@ namespace cl {
 namespace sycl {
 namespace detail {
 
+namespace half_impl {
+
+class half;
+// Half type is defined as _Float16 on device and as manually implemented half
+// type on host. Integration header is generated by device compiler so it sees
+// half type as _Float16 and it will add _Float16 to integration header if it
+// is used in kernel name template parameters. To avoid errors in host
+// compilation we remove _Float16 from integration header using following macro.
+#ifndef __SYCL_DEVICE_ONLY__
+#define _Float16 cl::sycl::detail::half_impl::half
+#endif
+
+} // namespace half_impl
+
 // kernel parameter kinds
 enum class kernel_param_kind_t {
   kind_accessor,
diff --git a/sycl/test/sub_group/load_store.cpp b/sycl/test/sub_group/load_store.cpp
index e89677cacbf15..19cb8ef10183c 100644
--- a/sycl/test/sub_group/load_store.cpp
+++ b/sycl/test/sub_group/load_store.cpp
@@ -16,10 +16,6 @@
 template <typename T, int N> class sycl_subgr;
 
 using namespace cl::sycl;
-// TODO remove this workaround when integration header will support correct
-// half generation
-struct wa_half;
-typedef half aligned_half __attribute__((aligned(16)));
 
 template <typename T, int N> void check(queue &Queue) {
   const int G = 1024, L = 64;
@@ -34,12 +30,10 @@ template <typename T, int N> void check(queue &Queue) {
         acc[i] += 0.1; // Check that floating point types are not casted to int
       }
     }
-    using TT = typename std::conditional<std::is_same<T, aligned_half>::value,
-                                         wa_half, T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc = syclbuf.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<sycl_subgr<TT, N>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         if (SG.get_group_id().get(0) % N == 0) {
           size_t WGSGoffset =
@@ -103,12 +97,10 @@ template <typename T> void check(queue &Queue) {
       }
     }
 
-    using TT = typename std::conditional<std::is_same<T, aligned_half>::value,
-                                         wa_half, T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc = syclbuf.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<sycl_subgr<TT, 0>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         if (NdItem.get_global_id(0) == 0)
           sgsizeacc[0] = SG.get_max_local_range()[0];
@@ -180,6 +172,7 @@ int main() {
     check<aligned_short, 4>(Queue);
     check<aligned_short, 8>(Queue);
     if (Queue.get_device().has_extension("cl_khr_fp16")) {
+      typedef half aligned_half __attribute__((aligned(16)));
       check<aligned_half>(Queue);
       check<aligned_half, 1>(Queue);
       check<aligned_half, 2>(Queue);
diff --git a/sycl/test/sub_group/shuffle.cpp b/sycl/test/sub_group/shuffle.cpp
index 66b74767710e9..9e1437a4a3753 100644
--- a/sycl/test/sub_group/shuffle.cpp
+++ b/sycl/test/sub_group/shuffle.cpp
@@ -33,8 +33,6 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
     buffer<vec<T, N>> buf_down(G);
     buffer<vec<T, N>> buf_xor(G);
     buffer<size_t> sgsizebuf(1);
-    using TT = typename std::conditional<std::is_same<T, half>::value, wa_half,
-                                         T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
       auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
@@ -48,7 +46,7 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
       auto acc_xor = buf_xor.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
-      cgh.parallel_for<sycl_subgr<TT, N>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);
@@ -141,8 +139,6 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
     buffer<T> buf_down(G);
     buffer<T> buf_xor(G);
     buffer<size_t> sgsizebuf(1);
-    using TT = typename std::conditional<std::is_same<T, half>::value, wa_half,
-                                         T>::type;
     Queue.submit([&](handler &cgh) {
       auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
       auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
@@ -156,7 +152,7 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
       auto acc_xor = buf_xor.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
 
-      cgh.parallel_for<sycl_subgr<TT, 0>>(NdRange, [=](nd_item<1> NdItem) {
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
         uint32_t wggid = NdItem.get_global_id(0);
         uint32_t sgid = SG.get_group_id().get(0);