diff --git a/SYCL/Basic/bit_cast/bit_cast.cpp b/SYCL/Basic/bit_cast/bit_cast.cpp
index d6b8ce6002..dfaf77ba1c 100644
--- a/SYCL/Basic/bit_cast/bit_cast.cpp
+++ b/SYCL/Basic/bit_cast/bit_cast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
@@ -13,11 +13,11 @@ constexpr sycl::access::mode sycl_write = sycl::access::mode::write;
 
 template <typename To, typename From> class BitCastKernel;
 
-template <typename To, typename From> To doBitCast(const From &ValueToConvert) {
+template <typename To, typename From>
+To doBitCast(sycl::queue Queue, const From &ValueToConvert) {
   std::vector<To> Vec(1);
   {
     sycl::buffer<To, 1> Buf(Vec.data(), 1);
-    sycl::queue Queue;
     Queue.submit([&](sycl::handler &cgh) {
       auto acc = Buf.template get_access<sycl_write>(cgh);
       cgh.single_task<class BitCastKernel<To, From>>([=]() {
@@ -28,8 +28,10 @@ template <typename To, typename From> To doBitCast(const From &ValueToConvert) {
   return Vec[0];
 }
 
-template <typename To, typename From> int test(const From &Value) {
-  auto ValueConvertedTwoTimes = doBitCast<From>(doBitCast<To>(Value));
+template <typename To, typename From>
+int test(sycl::queue Queue, const From &Value) {
+  auto ValueConvertedTwoTimes =
+      doBitCast<From>(Queue, doBitCast<To>(Queue, Value));
   bool isOriginalValueEqualsToConvertedTwoTimes = false;
   if (std::is_integral<From>::value) {
     isOriginalValueEqualsToConvertedTwoTimes = Value == ValueConvertedTwoTimes;
@@ -54,31 +56,34 @@ template <typename To, typename From> int test(const From &Value) {
 }
 
 int main() {
+  sycl::queue Queue;
   int ReturnCode = 0;
 
-  std::cout << "sycl::half to unsigned short ...\n";
-  ReturnCode += test<unsigned short>(sycl::half(1.0f));
+  if (Queue.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "sycl::half to unsigned short ...\n";
+    ReturnCode += test<unsigned short>(Queue, sycl::half(1.0f));
 
-  std::cout << "unsigned short to sycl::half ...\n";
-  ReturnCode += test<sycl::half>(static_cast<unsigned short>(16384));
+    std::cout << "unsigned short to sycl::half ...\n";
+    ReturnCode += test<sycl::half>(Queue, static_cast<unsigned short>(16384));
 
-  std::cout << "sycl::half to short ...\n";
-  ReturnCode += test<short>(sycl::half(1.0f));
+    std::cout << "sycl::half to short ...\n";
+    ReturnCode += test<short>(Queue, sycl::half(1.0f));
 
-  std::cout << "short to sycl::half ...\n";
-  ReturnCode += test<sycl::half>(static_cast<short>(16384));
+    std::cout << "short to sycl::half ...\n";
+    ReturnCode += test<sycl::half>(Queue, static_cast<short>(16384));
+  }
 
   std::cout << "int to float ...\n";
-  ReturnCode += test<float>(static_cast<int>(2));
+  ReturnCode += test<float>(Queue, static_cast<int>(2));
 
   std::cout << "float to int ...\n";
-  ReturnCode += test<int>(static_cast<float>(-2.4f));
+  ReturnCode += test<int>(Queue, static_cast<float>(-2.4f));
 
   std::cout << "unsigned int to float ...\n";
-  ReturnCode += test<float>(static_cast<unsigned int>(6));
+  ReturnCode += test<float>(Queue, static_cast<unsigned int>(6));
 
   std::cout << "float to unsigned int ...\n";
-  ReturnCode += test<unsigned int>(static_cast<float>(-2.4f));
+  ReturnCode += test<unsigned int>(Queue, static_cast<float>(-2.4f));
 
   return ReturnCode;
 }
diff --git a/SYCL/Basic/built-ins.cpp b/SYCL/Basic/built-ins.cpp
index 06ffb9e240..e0f0be0d98 100644
--- a/SYCL/Basic/built-ins.cpp
+++ b/SYCL/Basic/built-ins.cpp
@@ -3,6 +3,11 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
 // RUN: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER
 
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -D__SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ %s -o %t_nonvar.out
+// RUN: %CPU_RUN_PLACEHOLDER %t_nonvar.out %CPU_CHECK_PLACEHOLDER
+// RUN: %GPU_RUN_PLACEHOLDER %t_nonvar.out %GPU_CHECK_PLACEHOLDER
+// RUN: %ACC_RUN_PLACEHOLDER %t_nonvar.out %ACC_CHECK_PLACEHOLDER
+
 // CUDA does not support printf.
 // UNSUPPORTED: cuda
 //
@@ -28,15 +33,25 @@ static const CONSTANT char format[] = "Hello, World! %d %f\n";
 int main() {
   s::queue q{};
 
+#ifndef __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  if (!q.get_device().has(sycl::aspect::fp64)) {
+    std::cout
+        << "Test without __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ defined is "
+           "skipped because the device did not have fp64."
+        << std::endl;
+    return 0;
+  }
+#endif
+
   // Test printf
   q.submit([&](s::handler &CGH) {
      CGH.single_task<class printf>([=]() {
-       s::ext::oneapi::experimental::printf(format, 123, 1.23);
+       s::ext::oneapi::experimental::printf(format, 123, 1.23f);
        // CHECK: {{(Hello, World! 123 1.23)?}}
      });
    }).wait();
 
-  s::ext::oneapi::experimental::printf(format, 321, 3.21);
+  s::ext::oneapi::experimental::printf(format, 321, 3.21f);
   // CHECK: {{(Hello, World! 123 1.23)?}}
 
   // Test common
@@ -47,7 +62,7 @@ int main() {
       auto AccMin = BufMin.get_access<s::access::mode::write>(cgh);
       auto AccMax = BufMax.get_access<s::access::mode::write>(cgh);
       cgh.single_task<class common>([=]() {
-        AccMax[0] = s::max(s::cl_float2{0.5f, 2.5}, s::cl_float2{2.3f, 2.3});
+        AccMax[0] = s::max(s::cl_float2{0.5f, 2.5f}, s::cl_float2{2.3f, 2.3f});
         AccMin[0] = s::min(s::cl_float{0.5f}, s::cl_float{2.3f});
       });
     });
diff --git a/SYCL/Basic/half_builtins.cpp b/SYCL/Basic/half_builtins.cpp
index 1ef9cb2da9..34fab08c89 100644
--- a/SYCL/Basic/half_builtins.cpp
+++ b/SYCL/Basic/half_builtins.cpp
@@ -165,6 +165,14 @@ template <int N> bool check(vec<float, N> a, vec<float, N> b) {
 
 int main() {
   queue q;
+
+  if (!q.get_device().has(sycl::aspect::fp16)) {
+    std::cout
+        << "Test was skipped because the selected device does not support fp16"
+        << std::endl;
+    return 0;
+  }
+
   float16 a, b, c, d;
   for (int i = 0; i < SZ_max; i++) {
     a[i] = i / (float)SZ_max;
diff --git a/SYCL/Basic/scalar_vec_access.cpp b/SYCL/Basic/scalar_vec_access.cpp
index b945db3964..fc12c7bb86 100644
--- a/SYCL/Basic/scalar_vec_access.cpp
+++ b/SYCL/Basic/scalar_vec_access.cpp
@@ -29,7 +29,7 @@ int main() {
       // Test that it is possible to get a reference to single element of the
       // vector type. This behavior could possibly change in the future, this
       // test is necessary to track that.
-      float4_t my_float4 = {0.0, 1.0, 2.0, 3.0};
+      float4_t my_float4 = {0.0f, 1.0f, 2.0f, 3.0f};
       float f[4];
       for (int i = 0; i < 4; ++i) {
         f[i] = reinterpret_cast<float *>(&my_float4)[i];
@@ -40,14 +40,14 @@ int main() {
       }
 
       // Test that there is no template resolution error
-      sycl::float4 a = {1.0, 2.0, 3.0, 4.0};
+      sycl::float4 a = {1.0f, 2.0f, 3.0f, 4.0f};
       out << sycl::native::recip(a.x()) << sycl::endl;
     });
   });
   Q.wait();
 
   // Test that there is no ambiguity in overload resolution.
-  sycl::float4 a = {1.0, 2.0, 3.0, 4.0};
+  sycl::float4 a = {1.0f, 2.0f, 3.0f, 4.0f};
   std::cout << a.x() << std::endl;
 
   return 0;
diff --git a/SYCL/Basic/stream/stream.cpp b/SYCL/Basic/stream/stream.cpp
index ebdd3c9458..2c286a22a0 100644
--- a/SYCL/Basic/stream/stream.cpp
+++ b/SYCL/Basic/stream/stream.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
 // RUN: %GPU_RUN_ON_LINUX_PLACEHOLDER %t.out %GPU_CHECK_ON_LINUX_PLACEHOLDER
 // RUN: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER
@@ -97,26 +97,14 @@ int main() {
         // CHECK-NEXT: -12345678901245
         // CHECK-NEXT: 12345678901245
 
-        // Floating point types
+        // Floats
         Out << 33.4f << endl;
-        Out << 5.2 << endl;
         Out << -33.4f << endl;
-        Out << -5.2 << endl;
-        Out << 0.0003 << endl;
-        Out << -1.0 / 0.0 << endl;
-        Out << 1.0 / 0.0 << endl;
-        Out << sycl::sqrt(-1.0) << endl;
         Out << -1.0f / 0.0f << endl;
         Out << 1.0f / 0.0f << endl;
         Out << sycl::sqrt(-1.0f) << endl;
         // CHECK-NEXT: 33.4
-        // CHECK-NEXT: 5.2
         // CHECK-NEXT: -33.4
-        // CHECK-NEXT: -5.2
-        // CHECK-NEXT: 0.0003
-        // CHECK-NEXT: -inf
-        // CHECK-NEXT: inf
-        // CHECK-NEXT: nan
         // CHECK-NEXT: -inf
         // CHECK-NEXT: inf
         // CHECK-NEXT: nan
@@ -205,6 +193,33 @@ int main() {
     });
     Queue.wait();
 
+    if (Queue.get_device().has(sycl::aspect::fp64)) {
+      Queue.submit([&](handler &CGH) {
+        stream Out(1024, 80, CGH);
+        CGH.single_task<class doubles>([=]() {
+          // Double
+          Out << 5.2 << endl;
+          Out << -5.2 << endl;
+          Out << 0.0003 << endl;
+          Out << -1.0 / 0.0 << endl;
+          Out << 1.0 / 0.0 << endl;
+          Out << sycl::sqrt(-1.0) << endl;
+        });
+      });
+      Queue.wait();
+    } else {
+      // Repeat skipped message same number of times as the number of skipped
+      // output lines.
+      for (size_t I = 0; I < 6; ++I)
+        std::cout << "Skipped double test." << std::endl;
+    }
+    // CHECK-NEXT: {{(5.2|Skipped double test.)}}
+    // CHECK-NEXT: {{(-5.2|Skipped double test.)}}
+    // CHECK-NEXT: {{(0.0003|Skipped double test.)}}
+    // CHECK-NEXT: {{(-inf|Skipped double test.)}}
+    // CHECK-NEXT: {{(inf|Skipped double test.)}}
+    // CHECK-NEXT: {{(nan|Skipped double test.)}}
+
     // Stream in parallel_for
     Queue.submit([&](handler &CGH) {
       stream Out(1024, 80, CGH);
diff --git a/SYCL/Basic/vector_operators.cpp b/SYCL/Basic/vector_operators.cpp
index c52f7592b9..63a1af6dda 100644
--- a/SYCL/Basic/vector_operators.cpp
+++ b/SYCL/Basic/vector_operators.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
@@ -38,6 +38,7 @@ template <typename T, int N> void check_vector_size() {
 }
 
 int main() {
+  s::queue Queue;
 
   /* Separate checks for NumElements=1 edge case */
 
@@ -46,7 +47,6 @@ int main() {
     vec_type res;
     {
       s::buffer<vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isequal_vec_op_1_elem>([=]() {
@@ -67,7 +67,6 @@ int main() {
     vec_type res;
     {
       s::buffer<vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isequal_vec_op_1_elem_scalar>([=]() {
@@ -94,7 +93,6 @@ int main() {
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isequal_vec_op>([=]() {
@@ -109,12 +107,11 @@ int main() {
   }
 
   // Operator <, cl_double
-  {
+  if (Queue.get_device().has(sycl::aspect::fp64)) {
     using res_vec_type = s::vec<s::cl_long, 4>;
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isless_vec_op>([=]() {
@@ -134,7 +131,6 @@ int main() {
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isgreater_vec_op>([=]() {
@@ -149,12 +145,11 @@ int main() {
   }
 
   // Operator <=, cl_half
-  {
+  if (Queue.get_device().has(sycl::aspect::fp16)) {
     using res_vec_type = s::vec<s::cl_short, 4>;
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isnotgreater_vec_op>([=]() {
@@ -176,7 +171,6 @@ int main() {
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isnotless_vec_op>([=]() {
@@ -196,7 +190,6 @@ int main() {
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isnotequal_vec_op>([=]() {
@@ -216,7 +209,6 @@ int main() {
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class logical_and_vec_op>([=]() {
@@ -236,7 +228,6 @@ int main() {
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class logical_or_vec_op>([=]() {
@@ -257,7 +248,6 @@ int main() {
     res_vec_type res;
     {
       s::buffer<res_vec_type, 1> Buf(&res, s::range<1>(1));
-      s::queue Queue;
       Queue.submit([&](s::handler &cgh) {
         auto Acc = Buf.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class as_op>([=]() {
diff --git a/SYCL/Complex/sycl_complex_stream_test.cpp b/SYCL/Complex/sycl_complex_stream_test.cpp
index 7878f55984..9aa10c6db6 100644
--- a/SYCL/Complex/sycl_complex_stream_test.cpp
+++ b/SYCL/Complex/sycl_complex_stream_test.cpp
@@ -55,27 +55,37 @@ template <typename T> struct test_istream_operator {
   }
 };
 
-int main() {
-  sycl::queue Q;
-
+template <typename T> bool test_common(sycl::queue Q) {
   bool test_passes = true;
-
-  test_passes &=
-      test_valid_types<test_sycl_stream_operator>(Q, cmplx(1.5, -1.0));
   test_passes &=
-      test_valid_types<test_sycl_stream_operator>(Q, cmplx(INFINITY, INFINITY));
+      test_valid_types<test_sycl_stream_operator>(Q, cmplx<T>(1.5, -1.0));
+  test_passes &= test_valid_types<test_sycl_stream_operator>(
+      Q, cmplx<T>(INFINITY, INFINITY));
   test_passes &=
-      test_valid_types<test_sycl_stream_operator>(Q, cmplx(NAN, NAN));
+      test_valid_types<test_sycl_stream_operator>(Q, cmplx<T>(NAN, NAN));
 
-  test_passes &= test_valid_types<test_ostream_operator>(cmplx(1.5, -1.0));
+  test_passes &= test_valid_types<test_ostream_operator>(cmplx<T>(1.5, -1.0));
   test_passes &=
-      test_valid_types<test_ostream_operator>(cmplx(INFINITY, INFINITY));
-  test_passes &= test_valid_types<test_ostream_operator>(cmplx(NAN, NAN));
+      test_valid_types<test_ostream_operator>(cmplx<T>(INFINITY, INFINITY));
+  test_passes &= test_valid_types<test_ostream_operator>(cmplx<T>(NAN, NAN));
 
-  test_passes &= test_valid_types<test_istream_operator>(cmplx(1.5, -1.0));
+  test_passes &= test_valid_types<test_istream_operator>(cmplx<T>(1.5, -1.0));
   test_passes &=
-      test_valid_types<test_istream_operator>(cmplx(INFINITY, INFINITY));
-  test_passes &= test_valid_types<test_istream_operator>(cmplx(NAN, NAN));
+      test_valid_types<test_istream_operator>(cmplx<T>(INFINITY, INFINITY));
+  test_passes &= test_valid_types<test_istream_operator>(cmplx<T>(NAN, NAN));
+  return test_passes;
+}
+
+int main() {
+  sycl::queue Q;
+
+  bool test_passes = true;
+
+  test_passes &= test_common<float>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    test_passes &= test_common<sycl::half>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    test_passes &= test_common<double>(Q);
 
   if (!test_passes)
     std::cerr << "Stream operator with complex test fails\n";
diff --git a/SYCL/DeviceLib/built-ins/printf.cpp b/SYCL/DeviceLib/built-ins/printf.cpp
index 5afc03f565..36aee3b8f2 100644
--- a/SYCL/DeviceLib/built-ins/printf.cpp
+++ b/SYCL/DeviceLib/built-ins/printf.cpp
@@ -2,10 +2,15 @@
 // HIP doesn't support printf.
 // CUDA doesn't support vector format specifiers ("%v").
 //
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
 // RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
 // RUN: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -D__SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ %s -o %t_nonvar.out
+// RUN: %CPU_RUN_PLACEHOLDER %t_nonvar.out %CPU_CHECK_PLACEHOLDER
+// RUN: %GPU_RUN_PLACEHOLDER %t_nonvar.out %GPU_CHECK_PLACEHOLDER
+// RUN: %ACC_RUN_PLACEHOLDER %t_nonvar.out %ACC_CHECK_PLACEHOLDER
 
 #include <sycl/sycl.hpp>
 
@@ -34,9 +39,8 @@ static const CONSTANT char format_vec[] = "%d,%d,%d,%d\n";
 const CONSTANT char format_hello_world_2[] = "%lu: Hello, World!\n";
 
 int main() {
+  queue Queue(default_selector_v);
   {
-    queue Queue(default_selector_v);
-
     Queue.submit([&](handler &CGH) {
       CGH.single_task<class integral>([=]() {
         // String
@@ -54,17 +58,6 @@ int main() {
         // CHECK: 123
         // CHECK-NEXT: -123
 
-        // Floating point types
-        {
-          // You can declare format string in non-global scope, but in this case
-          // static keyword is required
-          static const CONSTANT char format[] = "%.1f\n";
-          ext::oneapi::experimental::printf(format, 33.4f);
-          ext::oneapi::experimental::printf(format, -33.4f);
-        }
-        // CHECK-NEXT: 33.4
-        // CHECK-NEXT: -33.4
-
         // Vectors
         sycl::vec<int, 4> v4{5, 6, 7, 8};
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
@@ -107,8 +100,38 @@ int main() {
     Queue.wait();
   }
 
+#ifndef __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  // Currently printf will promote floating point values to doubles.
+  // __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ changes the behavior to not use
+  // a variadic function, so if it is defined it will not promote the floating
+  // point arguments.
+  if (Queue.get_device().has(sycl::aspect::fp64))
+#endif // __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  {
+    Queue.submit([&](handler &CGH) {
+      CGH.single_task<class floating_points>([=]() {
+        // Floating point types
+        {
+          // You can declare format string in non-global scope, but in this case
+          // static keyword is required
+          static const CONSTANT char format[] = "%.1f\n";
+          ext::oneapi::experimental::printf(format, 33.4f);
+          ext::oneapi::experimental::printf(format, -33.4f);
+        }
+      });
+    });
+    Queue.wait();
+  }
+#ifndef __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  else {
+    std::cout << "Skipped floating point test." << std::endl;
+    std::cout << "Skipped floating point test." << std::endl;
+  }
+#endif // __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  // CHECK-NEXT: {{(33.4|Skipped floating point test.)}}
+  // CHECK-NEXT: {{(-33.4|Skipped floating point test.)}}
+
   {
-    queue Queue(default_selector_v);
     // printf in parallel_for
     Queue.submit([&](handler &CGH) {
       CGH.parallel_for<class stream_string>(range<1>(10), [=](id<1> i) {
diff --git a/SYCL/DeviceLib/built-ins/scalar_relational.cpp b/SYCL/DeviceLib/built-ins/scalar_relational.cpp
index 3e920f70c6..c9af02b703 100644
--- a/SYCL/DeviceLib/built-ins/scalar_relational.cpp
+++ b/SYCL/DeviceLib/built-ins/scalar_relational.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
@@ -11,12 +11,13 @@
 namespace s = sycl;
 
 int main() {
+  s::queue myQueue;
+
   // isequal-float
   {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isequalF1F1>([=]() {
@@ -32,7 +33,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isnotequalF1F1>([=]() {
@@ -48,7 +48,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isgreaterF1F1>([=]() {
@@ -64,7 +63,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isgreaterequalF1F1>([=]() {
@@ -80,7 +78,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class islessF1F1>([=]() {
@@ -96,7 +93,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class islessequalF1F1>([=]() {
@@ -112,7 +108,6 @@ int main() {
     s::cl_int r{1};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class islessgreaterF1F1>([=]() {
@@ -128,7 +123,6 @@ int main() {
     s::cl_int r{1};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isfiniteF1>(
@@ -143,7 +137,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isinfF1>(
@@ -158,7 +151,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isnanF1>(
@@ -173,7 +165,6 @@ int main() {
     s::cl_int r{1};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isnormalF1>(
@@ -184,11 +175,10 @@ int main() {
   }
 
   // isnormal-double
-  {
+  if (myQueue.get_device().has(sycl::aspect::fp64)) {
     s::cl_int r{1};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isnormalD1>(
@@ -203,7 +193,6 @@ int main() {
     s::cl_int r{1};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isorderedF1F1>([=]() {
@@ -219,7 +208,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class isunorderedF1F1>([=]() {
@@ -235,7 +223,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class signbitF1>(
@@ -250,7 +237,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class anyF1positive>(
@@ -264,7 +250,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class anyF1zero>(
@@ -279,7 +264,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class anyF1negative>(
@@ -294,7 +278,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class allF1positive>(
@@ -309,7 +292,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class allF1zero>(
@@ -324,7 +306,6 @@ int main() {
     s::cl_int r{0};
     {
       s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class allF1negative>(
@@ -339,7 +320,6 @@ int main() {
     s::cl_float r{0.0f};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class bitselectF1F1F1>([=]() {
@@ -356,7 +336,6 @@ int main() {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class selectF1F1I1positive>([=]() {
@@ -373,7 +352,6 @@ int main() {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class selectF1F1I1zero>([=]() {
@@ -390,7 +368,6 @@ int main() {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class selectF1F1I1negative>([=]() {
diff --git a/SYCL/DeviceLib/built-ins/vector_geometric.cpp b/SYCL/DeviceLib/built-ins/vector_geometric.cpp
index 4ee7735d5b..fc939e3cbb 100644
--- a/SYCL/DeviceLib/built-ins/vector_geometric.cpp
+++ b/SYCL/DeviceLib/built-ins/vector_geometric.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
@@ -15,12 +15,13 @@ template <typename T> bool isEqualTo(T x, T y, T epsilon = 0.005) {
 }
 
 int main() {
+  s::queue myQueue;
+
   // dot
   {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class dotF2F2>([=]() {
@@ -41,7 +42,6 @@ int main() {
     s::cl_float4 r{0};
     {
       s::buffer<s::cl_float4, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class crossF4>([=]() {
@@ -74,11 +74,10 @@ int main() {
   }
 
   // cross (double)
-  {
+  if (myQueue.get_device().has(sycl::aspect::fp64)) {
     s::cl_double4 r{0};
     {
       s::buffer<s::cl_double4, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class crossD4>([=]() {
@@ -115,7 +114,6 @@ int main() {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class distanceF2>([=]() {
@@ -139,7 +137,6 @@ int main() {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class lengthF2>([=]() {
@@ -158,7 +155,6 @@ int main() {
     s::cl_float2 r{0};
     {
       s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class normalizeF2>([=]() {
@@ -181,7 +177,6 @@ int main() {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class fast_distanceF2>([=]() {
@@ -205,7 +200,6 @@ int main() {
     s::cl_float r{0};
     {
       s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class fast_lengthF2>([=]() {
@@ -224,7 +218,6 @@ int main() {
     s::cl_float2 r{0};
     {
       s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
-      s::queue myQueue;
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class fast_normalizeF2>([=]() {
diff --git a/SYCL/DeviceLib/built-ins/vector_relational.cpp b/SYCL/DeviceLib/built-ins/vector_relational.cpp
index fc7571b1ae..7c6d20ba20 100644
--- a/SYCL/DeviceLib/built-ins/vector_relational.cpp
+++ b/SYCL/DeviceLib/built-ins/vector_relational.cpp
@@ -522,9 +522,9 @@ int main() {
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class bitselectF4F4F4>([=]() {
-          AccR[0] = s::bitselect(s::cl_float4{112.112, 12.12, 0, 0.0},
-                                 s::cl_float4{34.34, 23.23, 1, 0.0},
-                                 s::cl_float4{3.3, 6.6, 1, 0.0});
+          AccR[0] = s::bitselect(s::cl_float4{112.112f, 12.12f, 0, 0.0f},
+                                 s::cl_float4{34.34f, 23.23f, 1, 0.0f},
+                                 s::cl_float4{3.3f, 6.6f, 1, 0.0f});
         }); // Using NAN/INFINITY as any float produced consistent results
             // between host and device.
       });
diff --git a/SYCL/DeviceLib/std_complex_math_test.cpp b/SYCL/DeviceLib/std_complex_math_test.cpp
index f130190e15..5adcd2ef0a 100644
--- a/SYCL/DeviceLib/std_complex_math_test.cpp
+++ b/SYCL/DeviceLib/std_complex_math_test.cpp
@@ -196,6 +196,18 @@ void device_complex_test_2(s::queue &deviceQueue) {
 #endif
 int main() {
   s::queue deviceQueue;
+
+#ifdef _WIN32
+  // std::complex math on Windows uses doubles internally so fp64 is required to
+  // run this test.
+  if (!deviceQueue.get_device().has(s::aspect::fp64)) {
+    std::cout << "Skipping test as device does not support fp64 which is "
+                 "required for math operations on std::complex on Windows."
+              << std::endl;
+    return 0;
+  }
+#endif
+
   device_complex_test_1(deviceQueue);
 #ifndef _WIN32
   device_complex_test_2(deviceQueue);
diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
index 755dc2566e..d98eb0160d 100644
--- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
+++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
@@ -264,27 +264,35 @@ int main(void) {
   bool passed = true;
   using BinOp = esimd_test::BinaryOp;
 
+  bool SupportsDouble = dev.has(aspect::fp64);
+  bool SupportsHalf = dev.has(aspect::fp16);
+
   auto arith_ops = esimd_test::ArithBinaryOpsNoDiv;
   passed &= test<unsigned char, int, 1, BinOp, VSf, IDf>(arith_ops, q);
   passed &= test<char, float, 7, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
-  if (dev.has(aspect::fp64))
+  if (SupportsDouble)
     passed &= test<short, double, 7, BinOp, VEf, IDf>(arith_ops, q, 1e-15);
   passed &= test<float, float, 32, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
-  passed &= test<half, char, 1, BinOp, verify_n, IDf>(arith_ops, q, 1);
-  passed &= test<half, unsigned int, 32, BinOp, VSf, IDf>(arith_ops, q, 1);
-  if (dev.has(aspect::fp64))
+  if (SupportsHalf)
+    passed &= test<half, char, 1, BinOp, verify_n, IDf>(arith_ops, q, 1);
+  if (SupportsHalf)
+    passed &= test<half, unsigned int, 32, BinOp, VSf, IDf>(arith_ops, q, 1);
+  if (SupportsDouble && SupportsHalf)
     passed &= test<double, half, 7, BinOp, VSf, IDf>(arith_ops, q);
   passed &= test<short, uint64_t, 7, BinOp, VSf, IDf>(arith_ops, q);
 #ifdef USE_BF16
   passed &= test<bfloat16, int, 8, BinOp, VSf, IDf>(arith_ops, q);
-  passed &= test<half, bfloat16, 7, BinOp, VEfa, IDf>(arith_ops, q, 0.03);
+  if (SupportsHalf)
+    passed &= test<half, bfloat16, 7, BinOp, VEfa, IDf>(arith_ops, q, 0.03);
 #endif // USE_BF16
 #ifdef USE_TF32
   passed &= test<tfloat32, float, 32, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
   passed &=
       test<tfloat32, tfloat32, 32, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
   passed &= test<char, tfloat32, 32, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
-  passed &= test<tfloat32, half, 32, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
+  if (SupportsHalf)
+    passed &=
+        test<tfloat32, half, 32, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
   passed &= test<tfloat32, unsigned char, 32, BinOp, VEf, IDf>(arith_ops, q,
                                                                0.000001f);
   passed &= test<tfloat32, short, 32, BinOp, VEf, IDf>(arith_ops, q, 0.000001f);
@@ -294,27 +302,31 @@ int main(void) {
   passed &= test<unsigned char, int, 1, BinOp, VSf, IDf>(div_op, q);
   passed &= test<char, float, 7, BinOp, VEf, IDf>(div_op, q, 0.000001f);
 #ifndef WA_BUG
-  if (dev.has(aspect::fp64))
+  if (SupportsDouble)
     passed &= test<short, double, 7, BinOp, VSf, IDf>(div_op, q);
 #endif // WA_BUG
   passed &= test<float, float, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
-  passed &= test<half, char, 1, BinOp, verify_n, IDf>(div_op, q, 1);
-  passed &= test<half, unsigned int, 32, BinOp, VSf, IDf>(div_op, q, 1);
+  if (SupportsHalf)
+    passed &= test<half, char, 1, BinOp, verify_n, IDf>(div_op, q, 1);
+  if (SupportsHalf)
+    passed &= test<half, unsigned int, 32, BinOp, VSf, IDf>(div_op, q, 1);
 #ifndef WA_BUG
-  if (dev.has(aspect::fp64))
+  if (SupportsDouble && SupportsHalf)
     passed &= test<double, half, 7, BinOp, VSf, IDf>(div_op, q);
 #endif // WA_BUG
   passed &= test<short, uint64_t, 7, BinOp, VSf, IDf>(div_op, q);
 #ifdef USE_BF16
   passed &= test<bfloat16, short, 8, BinOp, VSf, IDf>(div_op, q);
-  passed &= test<half, bfloat16, 7, BinOp, VEfa, IDf>(div_op, q, 0.03);
+  if (SupportsHalf)
+    passed &= test<half, bfloat16, 7, BinOp, VEfa, IDf>(div_op, q, 0.03);
 #endif // USE_BF16
 
 #ifdef USE_TF32
   passed &= test<tfloat32, float, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
   passed &= test<tfloat32, tfloat32, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
   passed &= test<char, tfloat32, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
-  passed &= test<tfloat32, half, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
+  if (SupportsHalf)
+    passed &= test<tfloat32, half, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
   passed &=
       test<tfloat32, unsigned char, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
   passed &= test<tfloat32, short, 32, BinOp, VEf, IDf>(div_op, q, 0.000001f);
@@ -351,24 +363,28 @@ int main(void) {
   auto cmp_ops = esimd_test::CmpOps;
   passed &= test<unsigned char, int, 1, CmpOp, VSf, IDf>(cmp_ops, q);
   passed &= test<char, float, 7, CmpOp, VSf, IDf>(cmp_ops, q);
-  if (dev.has(aspect::fp64))
+  if (SupportsDouble)
     passed &= test<short, double, 7, CmpOp, VSf, IDf>(cmp_ops, q);
   passed &= test<float, float, 32, CmpOp, VSf, IDf>(cmp_ops, q);
-  passed &= test<half, char, 1, CmpOp, VSf, IDf>(cmp_ops, q, 1);
-  passed &= test<half, unsigned int, 32, CmpOp, VSf, IDf>(cmp_ops, q, 1);
-  if (dev.has(aspect::fp64))
+  if (SupportsHalf)
+    passed &= test<half, char, 1, CmpOp, VSf, IDf>(cmp_ops, q, 1);
+  if (SupportsHalf)
+    passed &= test<half, unsigned int, 32, CmpOp, VSf, IDf>(cmp_ops, q, 1);
+  if (SupportsDouble)
     passed &= test<double, half, 7, CmpOp, VSf, IDf>(cmp_ops, q);
   passed &= test<short, uint64_t, 7, CmpOp, VSf, IDf>(cmp_ops, q);
 #ifdef USE_BF16
   passed &= test<bfloat16, int, 32, CmpOp, VSf, IDf>(cmp_ops, q);
-  passed &= test<half, bfloat16, 7, CmpOp, VSf, IDf>(cmp_ops, q);
+  if (SupportsHalf)
+    passed &= test<half, bfloat16, 7, CmpOp, VSf, IDf>(cmp_ops, q);
 #endif // USE_BF16
 
 #ifdef USE_TF32
   passed &= test<tfloat32, float, 32, CmpOp, VSf, IDf>(cmp_ops, q);
   passed &= test<tfloat32, tfloat32, 32, CmpOp, VSf, IDf>(cmp_ops, q);
   passed &= test<char, tfloat32, 32, CmpOp, VSf, IDf>(cmp_ops, q);
-  passed &= test<tfloat32, half, 32, CmpOp, VSf, IDf>(cmp_ops, q);
+  if (SupportsHalf)
+    passed &= test<tfloat32, half, 32, CmpOp, VSf, IDf>(cmp_ops, q);
   passed &= test<tfloat32, unsigned char, 32, CmpOp, VSf, IDf>(cmp_ops, q);
   passed &= test<tfloat32, short, 32, CmpOp, VSf, IDf>(cmp_ops, q);
 #endif // USE_TF32
diff --git a/SYCL/ESIMD/api/saturation_smoke.cpp b/SYCL/ESIMD/api/saturation_smoke.cpp
index 005cb1497e..3d2ab818d4 100644
--- a/SYCL/ESIMD/api/saturation_smoke.cpp
+++ b/SYCL/ESIMD/api/saturation_smoke.cpp
@@ -185,10 +185,13 @@ int main(int argc, char **argv) {
   std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
             << "\n";
   const bool doublesSupported = dev.has(sycl::aspect::fp64);
+  const bool halfsSupported = dev.has(sycl::aspect::fp16);
 
   bool passed = true;
-  passed &= test<half, int, FpToInt>(q);
-  passed &= test<half, unsigned char, FpToInt>(q);
+  if (halfsSupported)
+    passed &= test<half, int, FpToInt>(q);
+  if (halfsSupported)
+    passed &= test<half, unsigned char, FpToInt>(q);
   passed &= test<float, int, FpToInt>(q);
   if (doublesSupported)
     passed &= test<double, short, FpToInt>(q);
@@ -207,7 +210,8 @@ int main(int argc, char **argv) {
   passed &= test<int, unsigned char, SIntToNarrowAnyInt>(q);
 
   passed &= test<float, float, FpToFp>(q);
-  passed &= test<half, half, FpToFp>(q);
+  if (halfsSupported)
+    passed &= test<half, half, FpToFp>(q);
   if (doublesSupported)
     passed &= test<double, double, FpToFp>(q);
 
diff --git a/SYCL/ESIMD/api/simd_memory_access.cpp b/SYCL/ESIMD/api/simd_memory_access.cpp
index 4912d52ca9..2e637c7f2f 100644
--- a/SYCL/ESIMD/api/simd_memory_access.cpp
+++ b/SYCL/ESIMD/api/simd_memory_access.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: gpu
 // UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
 // The test checks functionality of the memory access APIs which are members of
@@ -155,6 +155,7 @@ int main(int argc, char **argv) {
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
             << "\n";
+  const bool halfsSupported = dev.has(sycl::aspect::fp16);
 
   bool passed = true;
   passed &= test<char, 32, true>(q, size);
@@ -165,7 +166,8 @@ int main(int argc, char **argv) {
   passed &= test<int, 32, true>(q, size);
   passed &= test<unsigned int, 32, true>(q, size);
   passed &= test<float, 32, true>(q, size);
-  passed &= test<half, 32, true>(q, size);
+  if (halfsSupported)
+    passed &= test<half, 32, true>(q, size);
 
   passed &= test<char, 32, false>(q, size);
   passed &= test<unsigned char, 16, false>(q, size);
@@ -175,7 +177,8 @@ int main(int argc, char **argv) {
   passed &= test<int, 32, false>(q, size);
   passed &= test<unsigned int, 32, false>(q, size);
   passed &= test<float, 32, false>(q, size);
-  passed &= test<half, 32, false>(q, size);
+  if (halfsSupported)
+    passed &= test<half, 32, false>(q, size);
 
   std::cout << (passed ? "=== Test passed\n" : "=== Test FAILED\n");
   return passed ? 0 : 1;
diff --git a/SYCL/ESIMD/api/simd_view_copy_move_assign.cpp b/SYCL/ESIMD/api/simd_view_copy_move_assign.cpp
index 811605fb65..11ea98577a 100644
--- a/SYCL/ESIMD/api/simd_view_copy_move_assign.cpp
+++ b/SYCL/ESIMD/api/simd_view_copy_move_assign.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: gpu
 // UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // This test checks the behavior of simd_view constructors
@@ -164,7 +164,8 @@ int main(void) {
   bool passed = true;
   passed &= testT<char>(q);
   passed &= testT<float>(q);
-  passed &= testT<half>(q);
+  if (dev.has(sycl::aspect::fp16))
+    passed &= testT<half>(q);
 
   return passed ? 0 : 1;
 }
diff --git a/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp
index b2d326b690..3c8e53d032 100644
--- a/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp
+++ b/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp
@@ -22,7 +22,8 @@ int main(int argc, char **argv) {
             << "\n";
 
   bool passed = true;
-  passed &= test<half>(q);
+  if (dev.has(sycl::aspect::fp16))
+    passed &= test<half>(q);
   passed &= test<float>(q);
   if (dev.has(sycl::aspect::fp64))
     passed &= test<double>(q);
diff --git a/SYCL/ESIMD/api/slm_gather_scatter.cpp b/SYCL/ESIMD/api/slm_gather_scatter.cpp
index 5e3f1960d1..ceefb08f55 100644
--- a/SYCL/ESIMD/api/slm_gather_scatter.cpp
+++ b/SYCL/ESIMD/api/slm_gather_scatter.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: gpu
 // UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
 // The test checks functionality of the slm_gather/slm_scatter ESIMD APIs.
@@ -139,8 +139,10 @@ int main(void) {
   passed &= test<float, 16>(q);
   passed &= test<float, 32>(q);
 
-  passed &= test<half, 16>(q);
-  passed &= test<half, 32>(q);
+  if (dev.has(aspect::fp16)) {
+    passed &= test<half, 16>(q);
+    passed &= test<half, 32>(q);
+  }
 
   return passed ? 0 : 1;
 }
diff --git a/SYCL/ESIMD/api/slm_gather_scatter_heavy.cpp b/SYCL/ESIMD/api/slm_gather_scatter_heavy.cpp
index 3935790500..76451e6d30 100644
--- a/SYCL/ESIMD/api/slm_gather_scatter_heavy.cpp
+++ b/SYCL/ESIMD/api/slm_gather_scatter_heavy.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: gpu
 // UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
 // The test checks functionality of the slm gather/scatter ESIMD intrinsics.
@@ -463,8 +463,10 @@ int main(int argc, char **argv) {
   passed &= test<float, 16, 5>(q);
   passed &= test<float, 32, 3>(q);
   passed &= test_vl1<float, 7>(q);
-  passed &= test_vl1<half, 7>(q);
-  passed &= test<half, 16, 2>(q);
+  if (dev.has(aspect::fp16)) {
+    passed &= test_vl1<half, 7>(q);
+    passed &= test<half, 16, 2>(q);
+  }
 
   std::cout << (!passed ? "TEST FAILED\n" : "TEST Passed\n");
   return passed ? 0 : 1;
diff --git a/SYCL/ESIMD/api/svm_gather_scatter.cpp b/SYCL/ESIMD/api/svm_gather_scatter.cpp
index 7d149b3fc0..09cf3e9d30 100644
--- a/SYCL/ESIMD/api/svm_gather_scatter.cpp
+++ b/SYCL/ESIMD/api/svm_gather_scatter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: gpu && !gpu-intel-pvc
 // UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // Regression test for SVM gather/scatter API.
@@ -104,12 +104,14 @@ int main(void) {
   Pass &= test<int32_t, 16>(Q);
   Pass &= test<int32_t, 32>(Q);
 
-  Pass &= test<half, 1>(Q);
-  Pass &= test<half, 2>(Q);
-  Pass &= test<half, 4>(Q);
-  Pass &= test<half, 8>(Q);
-  Pass &= test<half, 16>(Q);
-  Pass &= test<half, 32>(Q);
+  if (Dev.has(aspect::fp16)) {
+    Pass &= test<half, 1>(Q);
+    Pass &= test<half, 2>(Q);
+    Pass &= test<half, 4>(Q);
+    Pass &= test<half, 8>(Q);
+    Pass &= test<half, 16>(Q);
+    Pass &= test<half, 32>(Q);
+  }
 
   Pass &= test<bfloat16, 1>(Q);
   Pass &= test<bfloat16, 2>(Q);
diff --git a/SYCL/ESIMD/api/unary_ops_heavy.cpp b/SYCL/ESIMD/api/unary_ops_heavy.cpp
index df14cecc0b..71206d0ed5 100644
--- a/SYCL/ESIMD/api/unary_ops_heavy.cpp
+++ b/SYCL/ESIMD/api/unary_ops_heavy.cpp
@@ -157,6 +157,8 @@ int main(void) {
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
             << "\n";
+  const bool doublesSupported = dev.has(sycl::aspect::fp64);
+  const bool halfsSupported = dev.has(sycl::aspect::fp16);
   bool passed = true;
   using UnOp = esimd_test::UnaryOp;
 
@@ -171,10 +173,12 @@ int main(void) {
   passed &= test<unsigned int, 8>(mod_ops, q);
   passed &= test<int64_t, 16>(mod_ops, q);
   passed &= test<uint64_t, 1>(mod_ops, q);
-  passed &= test<half, 1>(mod_ops, q);
-  passed &= test<half, 32>(mod_ops, q);
+  if (halfsSupported)
+    passed &= test<half, 1>(mod_ops, q);
+  if (halfsSupported)
+    passed &= test<half, 32>(mod_ops, q);
   passed &= test<float, 32>(mod_ops, q);
-  if (dev.has(aspect::fp64))
+  if (doublesSupported)
     passed &= test<double, 7>(mod_ops, q);
 
   auto signed_ops = esimd_test::OpSeq<UnOp, UnOp::minus, UnOp::plus>{};
@@ -182,9 +186,10 @@ int main(void) {
   passed &= test<short, 7>(signed_ops, q);
   passed &= test<int, 16>(signed_ops, q);
   passed &= test<int64_t, 16>(signed_ops, q);
-  passed &= test<half, 16>(signed_ops, q);
+  if (halfsSupported)
+    passed &= test<half, 16>(signed_ops, q);
   passed &= test<float, 16>(signed_ops, q);
-  if (dev.has(aspect::fp64))
+  if (doublesSupported)
     passed &= test<double, 16>(signed_ops, q);
 
 #ifdef USE_BF16
diff --git a/SYCL/ESIMD/printf.cpp b/SYCL/ESIMD/printf.cpp
index 4a65166ec5..0714852fe6 100644
--- a/SYCL/ESIMD/printf.cpp
+++ b/SYCL/ESIMD/printf.cpp
@@ -10,9 +10,17 @@
 // UNSUPPORTED: cuda || hip
 // CUDA and HIP don't support printf.
 //
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
 //
+// clang-format off
+// Test using __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ is disabled until ESIMD
+// supports it.
+// FIXME: enable that test
+// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -D__SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ %s -o %t_nonvar.out
+// RUNx: %GPU_RUN_PLACEHOLDER %t_nonvar.out %GPU_CHECK_PLACEHOLDER
+// clang-format on
+//
 //===----------------------------------------------------------------------===//
 //
 // The test checks that ESIMD kernels support printf functionality.
@@ -44,10 +52,8 @@ static const CONSTANT char format_hello_world[] = "Hello, World!\n";
 const CONSTANT char format_int[] = "%d\n";
 
 int main() {
+  queue Queue(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
   {
-    queue Queue(esimd_test::ESIMDSelector,
-                esimd_test::createExceptionHandler());
-
     Queue.submit([&](handler &CGH) {
       CGH.single_task([=]() SYCL_ESIMD_KERNEL {
         // String
@@ -60,17 +66,6 @@ int main() {
         // CHECK-NEXT: 123
         // CHECK-NEXT: -123
 
-        // Floating point types
-        {
-          // You can declare format string in non-global scope, but in this case
-          // static keyword is required
-          static const CONSTANT char format[] = "%f\n";
-          oneapi::experimental::printf(format, 33.4f);
-          oneapi::experimental::printf(format, -33.4f);
-        }
-        // CHECK-NEXT: 33.4
-        // CHECK-NEXT: -33.4
-
         // String types
         {
           static CONSTANT const char str_arg[] = "foo";
@@ -83,22 +78,59 @@ int main() {
     Queue.wait();
   }
 
+#ifndef __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  // Currently printf will promote floating point values to doubles.
+  // __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ changes the behavior to not use
+  // a variadic function, so if it is defined it will not promote the floating
+  // point arguments.
+  if (Queue.get_device().has(sycl::aspect::fp64))
+#endif // __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  {
+    Queue.submit([&](handler &CGH) {
+      CGH.single_task<class floating_points>([=]() {
+        // Floating point types
+        {
+          // You can declare format string in non-global scope, but in this case
+          // static keyword is required
+          static const CONSTANT char format[] = "%.1f\n";
+          ext::oneapi::experimental::printf(format, 33.4f);
+          ext::oneapi::experimental::printf(format, -33.4f);
+        }
+      });
+    });
+    Queue.wait();
+  }
+#ifndef __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  else {
+    std::cout << "Skipped floating point test." << std::endl;
+    std::cout << "Skipped floating point test." << std::endl;
+  }
+#endif // __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+  // CHECK-NEXT: {{(33.4|Skipped floating point test.)}}
+  // CHECK-NEXT: {{(-33.4|Skipped floating point test.)}}
+
   {
-    queue Q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
     // printf in parallel_for
     constexpr int SIMD_SIZE = 16;
     constexpr int WORK_SIZE = SIMD_SIZE;
-    int *Mem = malloc_shared<int>(WORK_SIZE * SIMD_SIZE, Q);
+    int *Mem = malloc_shared<int>(WORK_SIZE * SIMD_SIZE, Queue);
     for (int I = 0; I < WORK_SIZE * SIMD_SIZE; I++)
       Mem[I] = I;
     std::cout << "Start parallel_for:" << std::endl;
-    Q.parallel_for(range<1>(WORK_SIZE), [=](id<1> i) SYCL_ESIMD_KERNEL {
-       static const CONSTANT char STR_LU_D[] = "Thread-id: %d, Value: %d\n";
-       ext::intel::esimd::simd<int, SIMD_SIZE> Vec(Mem + i * SIMD_SIZE);
-       // cast to uint64_t to be sure that we pass 64-bit unsigned value
-       oneapi::experimental::printf(STR_LU_D, (uint64_t)i[0], (int)Vec[i]);
-     }).wait();
-    free(Mem, Q);
+    Queue
+        .parallel_for(range<1>(WORK_SIZE),
+                      [=](id<1> i) SYCL_ESIMD_KERNEL {
+                        static const CONSTANT char STR_LU_D[] =
+                            "Thread-id: %d, Value: %d\n";
+                        ext::intel::esimd::simd<int, SIMD_SIZE> Vec(
+                            Mem + i * SIMD_SIZE);
+                        // cast to uint64_t to be sure that we pass 64-bit
+                        // unsigned value
+                        oneapi::experimental::printf(STR_LU_D, (uint64_t)i[0],
+                                                     (int)Vec[i]);
+                      })
+        .wait();
+    free(Mem, Queue);
     // CHECK-LABEL: Start parallel_for
     // CHECK-DAG: Thread-id: 0, Value: 0
     // CHECK-DAG: Thread-id: 1, Value: 17
diff --git a/SYCL/ESIMD/regression/half_conversion_test.cpp b/SYCL/ESIMD/regression/half_conversion_test.cpp
index 7be3dce9d1..7a661fa5fc 100644
--- a/SYCL/ESIMD/regression/half_conversion_test.cpp
+++ b/SYCL/ESIMD/regression/half_conversion_test.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: gpu
 // UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //==- half_conversion_test.cpp - Test for half conversion under ESIMD_EMULATOR
 // backend -==/
@@ -81,7 +81,11 @@ int main(int argc, char *argv[]) {
   std::cout << "\n===================" << std::endl;
   passed &= test<short>(q, 1);
   std::cout << "\n===================" << std::endl;
-  passed &= test<half>(q, 1);
+  if (q.get_device().has(sycl::aspect::fp16))
+    passed &= test<half>(q, 1);
+  else
+    std::cout << "Half case skipped as the device does not support fp16."
+              << std::endl;
   std::cout << "\n===================" << std::endl;
   passed &= test<float>(q, 1);
   std::cout << "\n===================" << std::endl;
diff --git a/SYCL/ESIMD/regression/store_zero_const.cpp b/SYCL/ESIMD/regression/store_zero_const.cpp
index dcb5896137..a6a27527f1 100644
--- a/SYCL/ESIMD/regression/store_zero_const.cpp
+++ b/SYCL/ESIMD/regression/store_zero_const.cpp
@@ -91,7 +91,8 @@ int main(void) {
   Passed &= test<uint64_t>(Q);
   Passed &= test<int64_t>(Q);
   Passed &= test<float>(Q);
-  Passed &= test<double>(Q);
+  if (Q.get_device().has(aspect::fp64))
+    Passed &= test<double>(Q);
 
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
diff --git a/SYCL/ESIMD/vadd_half.cpp b/SYCL/ESIMD/vadd_half.cpp
index 1482aae505..596feccb81 100644
--- a/SYCL/ESIMD/vadd_half.cpp
+++ b/SYCL/ESIMD/vadd_half.cpp
@@ -59,6 +59,13 @@ int main(int argc, char **argv) {
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
 
+  if (!dev.has(sycl::aspect::fp16)) {
+    std::cout << "Test was skipped becasue the selected device does not "
+                 "support sycl::aspect::fp16"
+              << std::endl;
+    return 0;
+  }
+
   TstT *A = malloc_shared<TstT>(Size, q);
   SrcT *B = malloc_shared<SrcT>(Size, q);
   using DstT = __ESIMD_DNS::computation_type_t<TstT, SrcT>;
diff --git a/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp b/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp
index 6c230fa4c8..a8ace052d9 100644
--- a/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp
+++ b/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
@@ -84,7 +84,7 @@ int main() {
     std::fill(output.begin(), output.end(), std::complex<float>(0, 0));
     test<class KernelName_rCblcml>(q, input, output);
   }
-  {
+  if (q.get_device().has(sycl::aspect::fp64)) {
     std::array<std::complex<double>, N> input;
     std::array<std::complex<double>, 3> output;
     for (int i = 0; i < N; ++i) {
diff --git a/SYCL/GroupAlgorithm/broadcast.cpp b/SYCL/GroupAlgorithm/broadcast.cpp
index 9d80740d17..a929bd2f4f 100644
--- a/SYCL/GroupAlgorithm/broadcast.cpp
+++ b/SYCL/GroupAlgorithm/broadcast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
@@ -84,7 +84,7 @@ int main() {
     std::fill(output.begin(), output.end(), std::complex<float>(0, 0));
     test<class KernelName_rCblcml>(q, input, output);
   }
-  {
+  if (q.get_device().has(sycl::aspect::fp64)) {
     std::array<std::complex<double>, N> input;
     std::array<std::complex<double>, 3> output;
     for (int i = 0; i < N; ++i) {
diff --git a/SYCL/Regression/local-arg-align.cpp b/SYCL/Regression/local-arg-align.cpp
index f064a73c3b..ecd4fea37e 100644
--- a/SYCL/Regression/local-arg-align.cpp
+++ b/SYCL/Regression/local-arg-align.cpp
@@ -24,15 +24,15 @@ int main(int argc, char *argv[]) {
   buffer<size_t, 1> res(2);
 
   q.submit([&](sycl::handler &h) {
-     // Use two local buffers, one with an int and one with a double4
+     // Use two local buffers, one with an int and one with a float4
      local_accessor<cl_int, 1> a(1, h);
-     local_accessor<double4, 1> b(1, h);
+     local_accessor<float4, 1> b(1, h);
 
      auto ares = res.get_access<access::mode::read_write>(h);
 
      // Manually capture kernel arguments to ensure an order with the int
-     // argument first and the double4 argument second. If the two arguments are
-     // simply laid out consecutively, the double4 argument will not be
+     // argument first and the float4 argument second. If the two arguments are
+     // simply laid out consecutively, the float4 argument will not be
      // correctly aligned.
      h.parallel_for(1, [a, b, ares](sycl::id<1> i) {
        // Get the addresses of the two local buffers
@@ -52,10 +52,10 @@ int main(int argc, char *argv[]) {
     ret = -1;
   }
 
-  if (hres[1] % sizeof(double4) != 0) {
+  if (hres[1] % sizeof(float4) != 0) {
     std::cout
         << "Error: incorrect alignment for argument b, required alignment: "
-        << sizeof(double4) << ", address: " << (void *)hres[1] << std::endl;
+        << sizeof(float4) << ", address: " << (void *)hres[1] << std::endl;
     ret = -1;
   }
 
diff --git a/SYCL/SubGroup/broadcast_fp16.cpp b/SYCL/SubGroup/broadcast_fp16.cpp
index cde8e94477..4e2a35f660 100644
--- a/SYCL/SubGroup/broadcast_fp16.cpp
+++ b/SYCL/SubGroup/broadcast_fp16.cpp
@@ -13,7 +13,12 @@
 
 int main() {
   queue Queue;
-  check<sycl::half>(Queue);
-  std::cout << "Test passed." << std::endl;
+  if (Queue.get_device().has(sycl::aspect::fp16)) {
+    check<sycl::half>(Queue);
+    std::cout << "Test passed." << std::endl;
+  } else {
+    std::cout << "Test skipped because device doesn't support aspect::fp16"
+              << std::endl;
+  }
   return 0;
 }
diff --git a/SYCL/SubGroup/generic-shuffle.cpp b/SYCL/SubGroup/generic-shuffle.cpp
index e979ddfa85..76be56ef3e 100644
--- a/SYCL/SubGroup/generic-shuffle.cpp
+++ b/SYCL/SubGroup/generic-shuffle.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl-device-code-split=per_kernel -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
@@ -225,11 +225,17 @@ int main() {
   check_struct<class KernelName_zHfIPOLOFsXiZiCvG, std::complex<float>>(
       Queue, ComplexFloatGenerator);
 
-  auto ComplexDoubleGenerator = [state = std::complex<double>(0, 1)]() mutable {
-    return state += std::complex<double>(2, 2);
-  };
-  check_struct<class KernelName_CjlHUmnuxWtyejZFD, std::complex<double>>(
-      Queue, ComplexDoubleGenerator);
+  if (Queue.get_device().has(sycl::aspect::fp64)) {
+    auto ComplexDoubleGenerator = [state =
+                                       std::complex<double>(0, 1)]() mutable {
+      return state += std::complex<double>(2, 2);
+    };
+    check_struct<class KernelName_CjlHUmnuxWtyejZFD, std::complex<double>>(
+        Queue, ComplexDoubleGenerator);
+  } else {
+    std::cout << "fp64 tests were skipped due to the device not supporting the "
+                 "aspect.";
+  }
 
   std::cout << "Test passed." << std::endl;
   return 0;
diff --git a/SYCL/SubGroup/shuffle_fp16.cpp b/SYCL/SubGroup/shuffle_fp16.cpp
index da52ed3336..70224da203 100644
--- a/SYCL/SubGroup/shuffle_fp16.cpp
+++ b/SYCL/SubGroup/shuffle_fp16.cpp
@@ -22,7 +22,12 @@
 
 int main() {
   queue Queue;
-  check<half>(Queue);
-  std::cout << "Test passed." << std::endl;
+  if (Queue.get_device().has(sycl::aspect::fp16)) {
+    check<half>(Queue);
+    std::cout << "Test passed." << std::endl;
+  } else {
+    std::cout << "Test skipped because device doesn't support aspect::fp16"
+              << std::endl;
+  }
   return 0;
 }
diff --git a/SYCL/USM/copy.cpp b/SYCL/USM/copy.cpp
index edc543cd28..616168ff70 100644
--- a/SYCL/USM/copy.cpp
+++ b/SYCL/USM/copy.cpp
@@ -20,20 +20,48 @@ template <typename T> class transfer;
 
 static constexpr int N = 100; // should be even
 
-struct test_struct {
+struct test_struct_minimum {
   short a;
   int b;
   long c;
   long long d;
-  half e;
   float f;
 };
 
-bool operator==(const test_struct &lhs, const test_struct &rhs) {
+bool operator==(const test_struct_minimum &lhs,
+                const test_struct_minimum &rhs) {
+  return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
+         lhs.f == rhs.f;
+}
+
+struct test_struct_all : public test_struct_minimum {
+  sycl::half e;
+  double g;
+};
+
+bool operator==(const test_struct_all &lhs, const test_struct_all &rhs) {
+  return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
+         lhs.e == rhs.e && lhs.f == rhs.f && lhs.g == rhs.g;
+}
+
+struct test_struct_half : public test_struct_minimum {
+  sycl::half e;
+};
+
+bool operator==(const test_struct_half &lhs, const test_struct_half &rhs) {
   return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
          lhs.e == rhs.e && lhs.f == rhs.f;
 }
 
+struct test_struct_double : public test_struct_minimum {
+  double g;
+};
+
+bool operator==(const test_struct_double &lhs, const test_struct_double &rhs) {
+  return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
+         lhs.f == rhs.f && lhs.g == rhs.g;
+}
+
 template <typename T> T *regular(queue q, alloc kind) {
   return malloc<T>(N, q, kind);
 }
@@ -89,17 +117,34 @@ int main() {
   queue q;
   auto dev = q.get_device();
 
-  test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242f};
+  const bool DoublesSupported = dev.has(sycl::aspect::fp64);
+  const bool HalfsSupported = dev.has(sycl::aspect::fp16);
+
+  test_struct_all test_obj_all{4, 42, 424, 4242, 4.2f, 4.242, 4.24242};
+  test_struct_half test_obj_half{4, 42, 424, 4242, 4.2f, 4.242};
+  test_struct_double test_obj_double{4, 42, 424, 4242, 4.242, 4.24242};
+  test_struct_minimum test_obj_minimum{4, 42, 424, 4242, 4.242};
 
   if (dev.has(aspect::usm_host_allocations)) {
     runTests<short>(q, 4, alloc::host, alloc::host);
     runTests<int>(q, 42, alloc::host, alloc::host);
     runTests<long>(q, 424, alloc::host, alloc::host);
     runTests<long long>(q, 4242, alloc::host, alloc::host);
-    runTests<half>(q, half(4.2f), alloc::host, alloc::host);
+    if (HalfsSupported)
+      runTests<half>(q, half(4.2f), alloc::host, alloc::host);
     runTests<float>(q, 4.242f, alloc::host, alloc::host);
-    runTests<double>(q, 4.24242, alloc::host, alloc::host);
-    runTests<test_struct>(q, test_obj, alloc::host, alloc::host);
+    if (DoublesSupported)
+      runTests<double>(q, 4.24242, alloc::host, alloc::host);
+    if (HalfsSupported && DoublesSupported)
+      runTests<test_struct_all>(q, test_obj_all, alloc::host, alloc::host);
+    else if (HalfsSupported)
+      runTests<test_struct_half>(q, test_obj_half, alloc::host, alloc::host);
+    else if (DoublesSupported)
+      runTests<test_struct_double>(q, test_obj_double, alloc::host,
+                                   alloc::host);
+    else
+      runTests<test_struct_minimum>(q, test_obj_minimum, alloc::host,
+                                    alloc::host);
   }
 
   if (dev.has(aspect::usm_shared_allocations)) {
@@ -107,10 +152,22 @@ int main() {
     runTests<int>(q, 42, alloc::shared, alloc::shared);
     runTests<long>(q, 424, alloc::shared, alloc::shared);
     runTests<long long>(q, 4242, alloc::shared, alloc::shared);
-    runTests<half>(q, half(4.2f), alloc::shared, alloc::shared);
+    if (HalfsSupported)
+      runTests<half>(q, half(4.2f), alloc::shared, alloc::shared);
     runTests<float>(q, 4.242f, alloc::shared, alloc::shared);
-    runTests<double>(q, 4.24242, alloc::shared, alloc::shared);
-    runTests<test_struct>(q, test_obj, alloc::shared, alloc::shared);
+    if (DoublesSupported)
+      runTests<double>(q, 4.24242, alloc::shared, alloc::shared);
+    if (HalfsSupported && DoublesSupported)
+      runTests<test_struct_all>(q, test_obj_all, alloc::shared, alloc::shared);
+    else if (HalfsSupported)
+      runTests<test_struct_half>(q, test_obj_half, alloc::shared,
+                                 alloc::shared);
+    else if (DoublesSupported)
+      runTests<test_struct_double>(q, test_obj_double, alloc::shared,
+                                   alloc::shared);
+    else
+      runTests<test_struct_minimum>(q, test_obj_minimum, alloc::shared,
+                                    alloc::shared);
   }
 
   if (dev.has(aspect::usm_device_allocations)) {
@@ -118,10 +175,22 @@ int main() {
     runTests<int>(q, 42, alloc::device, alloc::device);
     runTests<long>(q, 424, alloc::device, alloc::device);
     runTests<long long>(q, 4242, alloc::device, alloc::device);
-    runTests<half>(q, half(4.2f), alloc::device, alloc::device);
+    if (HalfsSupported)
+      runTests<half>(q, half(4.2f), alloc::device, alloc::device);
     runTests<float>(q, 4.242f, alloc::device, alloc::device);
-    runTests<double>(q, 4.24242, alloc::device, alloc::device);
-    runTests<test_struct>(q, test_obj, alloc::device, alloc::device);
+    if (DoublesSupported)
+      runTests<double>(q, 4.24242, alloc::device, alloc::device);
+    if (HalfsSupported && DoublesSupported)
+      runTests<test_struct_all>(q, test_obj_all, alloc::device, alloc::device);
+    else if (HalfsSupported)
+      runTests<test_struct_half>(q, test_obj_half, alloc::device,
+                                 alloc::device);
+    else if (DoublesSupported)
+      runTests<test_struct_double>(q, test_obj_double, alloc::device,
+                                   alloc::device);
+    else
+      runTests<test_struct_minimum>(q, test_obj_minimum, alloc::device,
+                                    alloc::device);
   }
 
   if (dev.has(aspect::usm_host_allocations) &&
@@ -130,10 +199,21 @@ int main() {
     runTests<int>(q, 42, alloc::host, alloc::shared);
     runTests<long>(q, 424, alloc::host, alloc::shared);
     runTests<long long>(q, 4242, alloc::host, alloc::shared);
-    runTests<half>(q, half(4.2f), alloc::host, alloc::shared);
+    if (HalfsSupported)
+      runTests<half>(q, half(4.2f), alloc::host, alloc::shared);
     runTests<float>(q, 4.242f, alloc::host, alloc::shared);
-    runTests<double>(q, 4.24242, alloc::host, alloc::shared);
-    runTests<test_struct>(q, test_obj, alloc::host, alloc::shared);
+    if (DoublesSupported)
+      runTests<double>(q, 4.24242, alloc::host, alloc::shared);
+    if (HalfsSupported && DoublesSupported)
+      runTests<test_struct_all>(q, test_obj_all, alloc::host, alloc::shared);
+    else if (HalfsSupported)
+      runTests<test_struct_half>(q, test_obj_half, alloc::host, alloc::shared);
+    else if (DoublesSupported)
+      runTests<test_struct_double>(q, test_obj_double, alloc::host,
+                                   alloc::shared);
+    else
+      runTests<test_struct_minimum>(q, test_obj_minimum, alloc::host,
+                                    alloc::shared);
   }
 
   if (dev.has(aspect::usm_host_allocations) &&
@@ -142,10 +222,21 @@ int main() {
     runTests<int>(q, 42, alloc::host, alloc::device);
     runTests<long>(q, 424, alloc::host, alloc::device);
     runTests<long long>(q, 4242, alloc::host, alloc::device);
-    runTests<half>(q, half(4.2f), alloc::host, alloc::device);
+    if (HalfsSupported)
+      runTests<half>(q, half(4.2f), alloc::host, alloc::device);
     runTests<float>(q, 4.242f, alloc::host, alloc::device);
-    runTests<double>(q, 4.24242, alloc::host, alloc::device);
-    runTests<test_struct>(q, test_obj, alloc::host, alloc::device);
+    if (DoublesSupported)
+      runTests<double>(q, 4.24242, alloc::host, alloc::device);
+    if (HalfsSupported && DoublesSupported)
+      runTests<test_struct_all>(q, test_obj_all, alloc::host, alloc::device);
+    else if (HalfsSupported)
+      runTests<test_struct_half>(q, test_obj_half, alloc::host, alloc::device);
+    else if (DoublesSupported)
+      runTests<test_struct_double>(q, test_obj_double, alloc::host,
+                                   alloc::device);
+    else
+      runTests<test_struct_minimum>(q, test_obj_minimum, alloc::host,
+                                    alloc::device);
   }
 
   if (dev.has(aspect::usm_shared_allocations) &&
@@ -154,10 +245,22 @@ int main() {
     runTests<int>(q, 42, alloc::shared, alloc::device);
     runTests<long>(q, 424, alloc::shared, alloc::device);
     runTests<long long>(q, 4242, alloc::shared, alloc::device);
-    runTests<half>(q, half(4.2f), alloc::shared, alloc::device);
+    if (HalfsSupported)
+      runTests<half>(q, half(4.2f), alloc::shared, alloc::device);
     runTests<float>(q, 4.242f, alloc::shared, alloc::device);
-    runTests<double>(q, 4.24242, alloc::shared, alloc::device);
-    runTests<test_struct>(q, test_obj, alloc::shared, alloc::device);
+    if (DoublesSupported)
+      runTests<double>(q, 4.24242, alloc::shared, alloc::device);
+    if (HalfsSupported && DoublesSupported)
+      runTests<test_struct_all>(q, test_obj_all, alloc::shared, alloc::device);
+    else if (HalfsSupported)
+      runTests<test_struct_half>(q, test_obj_half, alloc::shared,
+                                 alloc::device);
+    else if (DoublesSupported)
+      runTests<test_struct_double>(q, test_obj_double, alloc::shared,
+                                   alloc::device);
+    else
+      runTests<test_struct_minimum>(q, test_obj_minimum, alloc::shared,
+                                    alloc::device);
   }
 
   return 0;
diff --git a/SYCL/USM/fill.cpp b/SYCL/USM/fill.cpp
index 49947fcc3d..a991db5968 100644
--- a/SYCL/USM/fill.cpp
+++ b/SYCL/USM/fill.cpp
@@ -20,21 +20,48 @@ template <typename T> class usm_aligned_device_transfer;
 
 static constexpr int N = 100;
 
-struct test_struct {
+struct test_struct_minimum {
   short a;
   int b;
   long c;
   long long d;
-  sycl::half e;
   float f;
+};
+
+bool operator==(const test_struct_minimum &lhs,
+                const test_struct_minimum &rhs) {
+  return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
+         lhs.f == rhs.f;
+}
+
+struct test_struct_all : public test_struct_minimum {
+  sycl::half e;
   double g;
 };
 
-bool operator==(const test_struct &lhs, const test_struct &rhs) {
+bool operator==(const test_struct_all &lhs, const test_struct_all &rhs) {
   return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
          lhs.e == rhs.e && lhs.f == rhs.f && lhs.g == rhs.g;
 }
 
+struct test_struct_half : public test_struct_minimum {
+  sycl::half e;
+};
+
+bool operator==(const test_struct_half &lhs, const test_struct_half &rhs) {
+  return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
+         lhs.e == rhs.e && lhs.f == rhs.f;
+}
+
+struct test_struct_double : public test_struct_minimum {
+  double g;
+};
+
+bool operator==(const test_struct_double &lhs, const test_struct_double &rhs) {
+  return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d &&
+         lhs.f == rhs.f && lhs.g == rhs.g;
+}
+
 template <typename T>
 void runHostTests(device dev, context ctxt, queue q, T val) {
   T *array;
@@ -125,20 +152,31 @@ int main() {
   auto ctxt = q.get_context();
 
   const bool DoublesSupported = dev.has(sycl::aspect::fp64);
+  const bool HalfsSupported = dev.has(sycl::aspect::fp16);
 
-  test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242, 4.24242};
+  test_struct_all test_obj_all{4, 42, 424, 4242, 4.2f, 4.242, 4.24242};
+  test_struct_half test_obj_half{4, 42, 424, 4242, 4.2f, 4.242};
+  test_struct_double test_obj_double{4, 42, 424, 4242, 4.242, 4.24242};
+  test_struct_minimum test_obj_minimum{4, 42, 424, 4242, 4.242};
 
   if (dev.get_info<info::device::usm_host_allocations>()) {
     runHostTests<short>(dev, ctxt, q, 4);
     runHostTests<int>(dev, ctxt, q, 42);
     runHostTests<long>(dev, ctxt, q, 424);
     runHostTests<long long>(dev, ctxt, q, 4242);
-    runHostTests<sycl::half>(dev, ctxt, q, sycl::half(4.2f));
+    if (HalfsSupported)
+      runHostTests<sycl::half>(dev, ctxt, q, sycl::half(4.2f));
     runHostTests<float>(dev, ctxt, q, 4.242f);
-    if (DoublesSupported) {
+    if (DoublesSupported)
       runHostTests<double>(dev, ctxt, q, 4.24242);
-      runHostTests<test_struct>(dev, ctxt, q, test_obj);
-    }
+    if (HalfsSupported && DoublesSupported)
+      runHostTests<test_struct_all>(dev, ctxt, q, test_obj_all);
+    else if (HalfsSupported)
+      runHostTests<test_struct_half>(dev, ctxt, q, test_obj_half);
+    else if (DoublesSupported)
+      runHostTests<test_struct_double>(dev, ctxt, q, test_obj_double);
+    else
+      runHostTests<test_struct_minimum>(dev, ctxt, q, test_obj_minimum);
   }
 
   if (dev.get_info<info::device::usm_shared_allocations>()) {
@@ -146,12 +184,19 @@ int main() {
     runSharedTests<int>(dev, ctxt, q, 42);
     runSharedTests<long>(dev, ctxt, q, 424);
     runSharedTests<long long>(dev, ctxt, q, 4242);
-    runSharedTests<sycl::half>(dev, ctxt, q, sycl::half(4.2f));
+    if (HalfsSupported)
+      runSharedTests<sycl::half>(dev, ctxt, q, sycl::half(4.2f));
     runSharedTests<float>(dev, ctxt, q, 4.242f);
-    if (DoublesSupported) {
+    if (DoublesSupported)
       runSharedTests<double>(dev, ctxt, q, 4.24242);
-      runSharedTests<test_struct>(dev, ctxt, q, test_obj);
-    }
+    if (HalfsSupported && DoublesSupported)
+      runSharedTests<test_struct_all>(dev, ctxt, q, test_obj_all);
+    else if (HalfsSupported)
+      runSharedTests<test_struct_half>(dev, ctxt, q, test_obj_half);
+    else if (DoublesSupported)
+      runSharedTests<test_struct_double>(dev, ctxt, q, test_obj_double);
+    else
+      runSharedTests<test_struct_minimum>(dev, ctxt, q, test_obj_minimum);
   }
 
   if (dev.get_info<info::device::usm_device_allocations>()) {
@@ -159,12 +204,19 @@ int main() {
     runDeviceTests<int>(dev, ctxt, q, 42);
     runDeviceTests<long>(dev, ctxt, q, 420);
     runDeviceTests<long long>(dev, ctxt, q, 4242);
-    runDeviceTests<sycl::half>(dev, ctxt, q, sycl::half(4.2f));
+    if (HalfsSupported)
+      runDeviceTests<sycl::half>(dev, ctxt, q, sycl::half(4.2f));
     runDeviceTests<float>(dev, ctxt, q, 4.242f);
-    if (DoublesSupported) {
+    if (DoublesSupported)
       runDeviceTests<double>(dev, ctxt, q, 4.24242);
-      runDeviceTests<test_struct>(dev, ctxt, q, test_obj);
-    }
+    if (HalfsSupported && DoublesSupported)
+      runDeviceTests<test_struct_all>(dev, ctxt, q, test_obj_all);
+    else if (HalfsSupported)
+      runDeviceTests<test_struct_half>(dev, ctxt, q, test_obj_half);
+    else if (DoublesSupported)
+      runDeviceTests<test_struct_double>(dev, ctxt, q, test_obj_double);
+    else
+      runDeviceTests<test_struct_minimum>(dev, ctxt, q, test_obj_minimum);
   }
 
   return 0;