diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index d7b1dcf72e8ca..47b9d4e39f1a6 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -911,7 +911,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
       PB.registerPipelineStartEPCallback(
           [&](ModulePassManager &MPM, OptimizationLevel Level) {
             MPM.addPass(ESIMDVerifierPass(LangOpts.SYCLESIMDForceStatelessMem));
-            MPM.addPass(SYCLPropagateAspectsUsagePass());
+            MPM.addPass(
+                SYCLPropagateAspectsUsagePass(/*ExcludeAspects=*/{"fp64"}));
           });
 
     // Add the InferAddressSpaces pass for all the SPIR[V] targets
@@ -1026,6 +1027,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
       if (LangOpts.EnableDAEInSpirKernels)
         MPM.addPass(DeadArgumentEliminationSYCLPass());
 
+      // Rerun aspect propagation without warning diagnostics.
+      MPM.addPass(SYCLPropagateAspectsUsagePass(/*ExcludeAspects=*/{},
+                                                /*ValidateAspects=*/false));
+
       // Add SPIRITTAnnotations pass to the pass manager if
       // -fsycl-instrument-device-code option was passed. This option can be
       // used only with spir triple.
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 718f6565ee23f..6ef967dac4c1f 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -591,6 +591,13 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     ToolChain.addFastMathRuntimeIfAvailable(Args, CmdArgs);
   }
 
+  // Performing link for dependency file information, undefined symbols are OK.
+  // True link time errors for symbols will be captured at host link.
+  if (JA.getType() == types::TY_Host_Dependencies_Image) {
+    CmdArgs.push_back("-z");
+    CmdArgs.push_back("undefs");
+  }
+
   Args.AddAllArgs(CmdArgs, options::OPT_L);
   Args.AddAllArgs(CmdArgs, options::OPT_u);
 
diff --git a/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp b/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp
index ae8dc07ae5006..91414bcfe87ea 100644
--- a/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp
+++ b/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang++ -fsycl-device-only -fintelfpga -S %s -o - | FileCheck %s
+// RUN: %clang -fsycl-device-only -fintelfpga -S %s -o - | FileCheck %s
 
 #include "Inputs/sycl.hpp"
 
diff --git a/clang/test/Driver/sycl-offload-intelfpga-link.cpp b/clang/test/Driver/sycl-offload-intelfpga-link.cpp
index d1fdf5aa89101..ae143a8258a9e 100644
--- a/clang/test/Driver/sycl-offload-intelfpga-link.cpp
+++ b/clang/test/Driver/sycl-offload-intelfpga-link.cpp
@@ -105,6 +105,14 @@
 // RUN:  | FileCheck %s --check-prefix=CHK-FPGA-LINK-WARN-AOCR
 // CHK-FPGA-LINK-WARN-AOCR: warning: FPGA archive '{{.*}}-aocr.a' does not contain matching emulation/hardware expectancy
 
+/// Check deps behaviors with input fat archive and creating aocx archive
+// RUN: %clangxx -fsycl -fintelfpga -fsycl-link=image \
+// RUN:          -target x86_64-unknown-linux-gnu %S/Inputs/SYCL/liblin64.a \
+// RUN:          %s -### 2>&1 \
+// RUN:  | FileCheck %s --check-prefix=CHK-FPGA-LINK-UNDEFS
+// CHK-FPGA-LINK-UNDEFS: ld{{.*}} "-z" "undefs"
+// CHK-FPGA-LINK-UNDEFS: clang-offload-deps{{.*}}
+
 /// -fintelfpga -fsycl-link from source
 // RUN: touch %t.cpp
 // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl -fno-sycl-instrument-device-code -fno-sycl-device-lib=all -fintelfpga -fsycl-link=early %t.cpp -ccc-print-phases 2>&1 \
diff --git a/clang/test/Driver/sycl-offload-static-lib-2.cpp b/clang/test/Driver/sycl-offload-static-lib-2.cpp
index ef9c5043306ac..7332b487e677b 100644
--- a/clang/test/Driver/sycl-offload-static-lib-2.cpp
+++ b/clang/test/Driver/sycl-offload-static-lib-2.cpp
@@ -150,7 +150,7 @@
 // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=nvptx64-nvidia-cuda %t_lib.a -o output_name -lOpenCL -### %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=STATIC_LIB_SRC2 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 -DDEPS_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50
 // STATIC_LIB_SRC2: clang{{.*}} "-emit-obj" {{.*}} "-o" "[[HOSTOBJ:.+\.o]]"
-// STATIC_LIB_SRC2: ld{{(.exe)?}}" {{.*}} "-o" "[[HOSTEXE:.+\.out]]"
+// STATIC_LIB_SRC2: ld{{(.exe)?}}" {{.*}} "-o" "[[HOSTEXE:.+\.out]]" {{.*}}"-z" "undefs"
 // STATIC_LIB_SRC2: clang-offload-deps{{.*}} "-targets=[[DEPS_TRIPLE]]" "-outputs=[[OUTDEPS:.+\.bc]]" "[[HOSTEXE]]"
 // STATIC_LIB_SRC2_DEF: clang-offload-bundler{{.*}} "-type=aoo" "-targets=[[BUNDLE_TRIPLE]]" {{.*}} "-output=[[OUTLIB:.+\.txt]]"
 // STATIC_LIB_SRC2_NVPTX: clang-offload-bundler{{.*}} "-type=a" "-targets=[[BUNDLE_TRIPLE]]" {{.*}} "-output=[[OUTLIB:.+\.a]]"
diff --git a/libclc/generic/include/spirv/spirv_types.h b/libclc/generic/include/spirv/spirv_types.h
index be7101dadae29..d45e089032b2b 100644
--- a/libclc/generic/include/spirv/spirv_types.h
+++ b/libclc/generic/include/spirv/spirv_types.h
@@ -46,4 +46,16 @@ enum GroupOperation {
   ExclusiveScan = 2,
 };
 
+typedef struct {
+  float real, imag;
+} complex_float;
+
+typedef struct {
+  double real, imag;
+} complex_double;
+
+typedef struct {
+  half real, imag;
+} complex_half;
+
 #endif // CLC_SPIRV_TYPES
diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl
index cf2bd33e987c4..6449152ecb34d 100644
--- a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl
+++ b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl
@@ -41,6 +41,12 @@ __local float *
 __clc__get_group_scratch_float() __asm("__clc__get_group_scratch_float");
 __local double *
 __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double");
+__local complex_half *__clc__get_group_scratch_complex_half() __asm(
+    "__clc__get_group_scratch_complex_half");
+__local complex_float *__clc__get_group_scratch_complex_float() __asm(
+    "__clc__get_group_scratch_complex_float");
+__local complex_double *__clc__get_group_scratch_complex_double() __asm(
+    "__clc__get_group_scratch_complex_double");
 
 _CLC_DEF uint inline __clc__membermask() {
   // use a full mask as sync operations are required to be convergent and
@@ -89,6 +95,46 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT double __clc__SubgroupShuffle(double x,
   return as_double(__clc__SubgroupShuffle(as_ulong(x), idx));
 }
 
+typedef union {
+  complex_half h;
+  int i;
+} complex_half_converter;
+
+typedef union {
+  complex_float f;
+  int2 i;
+} complex_float_converter;
+
+typedef union {
+  complex_double d;
+  int4 i;
+} complex_double_converter;
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_half
+__clc__SubgroupShuffle(complex_half x, uint idx) {
+  complex_half_converter conv = {x};
+  conv.i = __clc__SubgroupShuffle(conv.i, idx);
+  return conv.h;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_float
+__clc__SubgroupShuffle(complex_float x, uint idx) {
+  complex_float_converter conv = {x};
+  conv.i.x = __clc__SubgroupShuffle(conv.i.x, idx);
+  conv.i.y = __clc__SubgroupShuffle(conv.i.y, idx);
+  return conv.f;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_double
+__clc__SubgroupShuffle(complex_double x, uint idx) {
+  complex_double_converter conv = {x};
+  conv.i.x = __clc__SubgroupShuffle(conv.i.x, idx);
+  conv.i.y = __clc__SubgroupShuffle(conv.i.y, idx);
+  conv.i.z = __clc__SubgroupShuffle(conv.i.z, idx);
+  conv.i.w = __clc__SubgroupShuffle(conv.i.w, idx);
+  return conv.d;
+}
+
 #define __CLC_SUBGROUP_SHUFFLEUP_I32(TYPE)                                     \
   _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __clc__SubgroupShuffleUp(        \
       TYPE x, uint delta) {                                                    \
@@ -130,6 +176,31 @@ __clc__SubgroupShuffleUp(double x, uint delta) {
   return as_double(__clc__SubgroupShuffleUp(as_ulong(x), delta));
 }
 
+_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_half
+__clc__SubgroupShuffleUp(complex_half x, uint delta) {
+  complex_half_converter conv = {x};
+  conv.i = __clc__SubgroupShuffleUp(conv.i, delta);
+  return conv.h;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_float
+__clc__SubgroupShuffleUp(complex_float x, uint delta) {
+  complex_float_converter conv = {x};
+  conv.i.x = __clc__SubgroupShuffleUp(conv.i.x, delta);
+  conv.i.y = __clc__SubgroupShuffleUp(conv.i.y, delta);
+  return conv.f;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_double
+__clc__SubgroupShuffleUp(complex_double x, uint delta) {
+  complex_double_converter conv = {x};
+  conv.i.x = __clc__SubgroupShuffleUp(conv.i.x, delta);
+  conv.i.y = __clc__SubgroupShuffleUp(conv.i.y, delta);
+  conv.i.z = __clc__SubgroupShuffleUp(conv.i.z, delta);
+  conv.i.w = __clc__SubgroupShuffleUp(conv.i.w, delta);
+  return conv.d;
+}
+
 // TODO: Implement InclusiveScan/ExclusiveScan
 //       Currently only Reduce is required (for GroupAny and GroupAll)
 _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT bool
@@ -155,6 +226,75 @@ __clc__SubgroupBitwiseAny(uint op, bool predicate, bool *carry) {
 #define __CLC_AND(x, y) (x & y)
 #define __CLC_MUL(x, y) (x * y)
 
+#define __DEFINE_CLC_COMPLEX_MUL(TYPE)                                         \
+  _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_##TYPE __clc_complex_mul(     \
+      complex_##TYPE z, complex_##TYPE w) {                                    \
+    TYPE a = z.real;                                                           \
+    TYPE b = z.imag;                                                           \
+    TYPE c = w.real;                                                           \
+    TYPE d = w.imag;                                                           \
+    TYPE ac = a * c;                                                           \
+    TYPE bd = b * d;                                                           \
+    TYPE ad = a * d;                                                           \
+    TYPE bc = b * c;                                                           \
+    TYPE x = ac - bd;                                                          \
+    TYPE y = ad + bc;                                                          \
+    if (__spirv_IsNan(x) && __spirv_IsNan(y)) {                                \
+      bool __recalc = false;                                                   \
+      if (__spirv_IsInf(a) || __spirv_IsInf(b)) {                              \
+        a = __spirv_ocl_copysign(__spirv_IsInf(a) ? (TYPE)1 : (TYPE)0, a);     \
+        b = __spirv_ocl_copysign(__spirv_IsInf(b) ? (TYPE)1 : (TYPE)0, b);     \
+        if (__spirv_IsNan(c))                                                  \
+          c = __spirv_ocl_copysign((TYPE)0, c);                                \
+        if (__spirv_IsNan(d))                                                  \
+          d = __spirv_ocl_copysign((TYPE)0, d);                                \
+        __recalc = true;                                                       \
+      }                                                                        \
+      if (__spirv_IsInf(c) || __spirv_IsInf(d)) {                              \
+        c = __spirv_ocl_copysign(__spirv_IsInf(c) ? (TYPE)1 : (TYPE)0, c);     \
+        d = __spirv_ocl_copysign(__spirv_IsInf(d) ? (TYPE)1 : (TYPE)0, d);     \
+        if (__spirv_IsNan(a))                                                  \
+          a = __spirv_ocl_copysign((TYPE)0, a);                                \
+        if (__spirv_IsNan(b))                                                  \
+          b = __spirv_ocl_copysign((TYPE)0, b);                                \
+        __recalc = true;                                                       \
+      }                                                                        \
+      if (!__recalc && (__spirv_IsInf(ac) || __spirv_IsInf(bd) ||              \
+                        __spirv_IsInf(ad) || __spirv_IsInf(bc))) {             \
+        if (__spirv_IsNan(a))                                                  \
+          a = __spirv_ocl_copysign((TYPE)0, a);                                \
+        if (__spirv_IsNan(b))                                                  \
+          b = __spirv_ocl_copysign((TYPE)0, b);                                \
+        if (__spirv_IsNan(c))                                                  \
+          c = __spirv_ocl_copysign((TYPE)0, c);                                \
+        if (__spirv_IsNan(d))                                                  \
+          d = __spirv_ocl_copysign((TYPE)0, d);                                \
+        __recalc = true;                                                       \
+      }                                                                        \
+      if (__recalc) {                                                          \
+        x = (TYPE)INFINITY * (a * c - b * d);                                  \
+        y = (TYPE)INFINITY * (a * d + b * c);                                  \
+      }                                                                        \
+    }                                                                          \
+    return (complex_##TYPE){x, y};                                             \
+  }
+
+__DEFINE_CLC_COMPLEX_MUL(half)
+__DEFINE_CLC_COMPLEX_MUL(float)
+__DEFINE_CLC_COMPLEX_MUL(double)
+#undef __DEFINE_CLC_COMPLEX_MUL
+
+// TODO remove these definitions after we have proper implementation of
+// std::complex multiplication in SYCL
+complex_float __mulsc3(float a, float b, float c, float d) {
+  return __clc_complex_mul((complex_float){a, b}, (complex_float){c, d});
+}
+complex_double __muldc3(double a, double b, double c, double d) {
+  return __clc_complex_mul((complex_double){a, b}, (complex_double){c, d});
+}
+
+#define __CLC_COMPLEX_MUL(x, y) __clc_complex_mul(x, y)
+
 #define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY)                     \
   uint sg_lid = __spirv_SubgroupLocalInvocationId();                           \
   /* Can't use XOR/butterfly shuffles; some lanes may be inactive */           \
@@ -225,6 +365,13 @@ __CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, half, 1)
 __CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, float, 1)
 __CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, double, 1)
 
+__CLC_SUBGROUP_COLLECTIVE(CMulINTEL, __CLC_COMPLEX_MUL, complex_half,
+                          ((complex_half){1, 0}))
+__CLC_SUBGROUP_COLLECTIVE(CMulINTEL, __CLC_COMPLEX_MUL, complex_float,
+                          ((complex_float){1, 0}))
+__CLC_SUBGROUP_COLLECTIVE(CMulINTEL, __CLC_COMPLEX_MUL, complex_double,
+                          ((complex_double){1, 0}))
+
 __CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, char, CHAR_MAX)
 __CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, uchar, UCHAR_MAX)
 __CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, short, SHRT_MAX)
@@ -281,55 +428,58 @@ __CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, 0l)
 #undef __CLC_SUBGROUP_COLLECTIVE
 #undef __CLC_SUBGROUP_COLLECTIVE_REDUX
 
-#define __CLC_GROUP_COLLECTIVE_INNER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY) \
-  _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND(                    \
-      __spirv_Group, SPIRV_NAME)(uint scope, uint op, TYPE x) {                \
-    TYPE carry = IDENTITY;                                                     \
-    /* Perform GroupOperation within sub-group */                              \
-    TYPE sg_x = __CLC_APPEND(__clc__Subgroup, CLC_NAME)(op, x, &carry);        \
-    if (scope == Subgroup) {                                                   \
-      return sg_x;                                                             \
-    }                                                                          \
-    __local TYPE *scratch = __CLC_APPEND(__clc__get_group_scratch_, TYPE)();   \
-    uint sg_id = __spirv_SubgroupId();                                         \
-    uint num_sg = __spirv_NumSubgroups();                                      \
-    uint sg_lid = __spirv_SubgroupLocalInvocationId();                         \
-    uint sg_size = __spirv_SubgroupSize();                                     \
-    /* Share carry values across sub-groups */                                 \
-    if (sg_lid == sg_size - 1) {                                               \
-      scratch[sg_id] = carry;                                                  \
-    }                                                                          \
-    __spirv_ControlBarrier(Workgroup, 0, 0);                                   \
-    /* Perform InclusiveScan over sub-group results */                         \
-    TYPE sg_prefix;                                                            \
-    TYPE sg_aggregate = scratch[0];                                            \
-    _Pragma("unroll") for (int s = 1; s < num_sg; ++s) {                       \
-      if (sg_id == s) {                                                        \
-        sg_prefix = sg_aggregate;                                              \
-      }                                                                        \
-      TYPE addend = scratch[s];                                                \
-      sg_aggregate = OP(sg_aggregate, addend);                                 \
+#define __CLC_GROUP_COLLECTIVE_INNER(CLC_NAME, OP, TYPE, IDENTITY)             \
+  TYPE carry = IDENTITY;                                                       \
+  /* Perform GroupOperation within sub-group */                                \
+  TYPE sg_x = __CLC_APPEND(__clc__Subgroup, CLC_NAME)(op, x, &carry);          \
+  if (scope == Subgroup) {                                                     \
+    return sg_x;                                                               \
+  }                                                                            \
+  __local TYPE *scratch = __CLC_APPEND(__clc__get_group_scratch_, TYPE)();     \
+  uint sg_id = __spirv_SubgroupId();                                           \
+  uint num_sg = __spirv_NumSubgroups();                                        \
+  uint sg_lid = __spirv_SubgroupLocalInvocationId();                           \
+  uint sg_size = __spirv_SubgroupSize();                                       \
+  /* Share carry values across sub-groups */                                   \
+  if (sg_lid == sg_size - 1) {                                                 \
+    scratch[sg_id] = carry;                                                    \
+  }                                                                            \
+  __spirv_ControlBarrier(Workgroup, 0, 0);                                     \
+  /* Perform InclusiveScan over sub-group results */                           \
+  TYPE sg_prefix;                                                              \
+  TYPE sg_aggregate = scratch[0];                                              \
+  _Pragma("unroll") for (int s = 1; s < num_sg; ++s) {                         \
+    if (sg_id == s) {                                                          \
+      sg_prefix = sg_aggregate;                                                \
     }                                                                          \
-    /* For Reduce, broadcast result from final sub-group */                    \
-    /* For Scan, combine results from previous sub-groups */                   \
-    TYPE result;                                                               \
-    if (op == Reduce) {                                                        \
-      result = sg_aggregate;                                                   \
-    } else if (op == InclusiveScan || op == ExclusiveScan) {                   \
-      if (sg_id == 0) {                                                        \
-        result = sg_x;                                                         \
-      } else {                                                                 \
-        result = OP(sg_x, sg_prefix);                                          \
-      }                                                                        \
+    TYPE addend = scratch[s];                                                  \
+    sg_aggregate = OP(sg_aggregate, addend);                                   \
+  }                                                                            \
+  /* For Reduce, broadcast result from final sub-group */                      \
+  /* For Scan, combine results from previous sub-groups */                     \
+  TYPE result;                                                                 \
+  if (op == Reduce) {                                                          \
+    result = sg_aggregate;                                                     \
+  } else if (op == InclusiveScan || op == ExclusiveScan) {                     \
+    if (sg_id == 0) {                                                          \
+      result = sg_x;                                                           \
+    } else {                                                                   \
+      result = OP(sg_x, sg_prefix);                                            \
     }                                                                          \
-    __spirv_ControlBarrier(Workgroup, 0, 0);                                   \
-    return result;                                                             \
+  }                                                                            \
+  __spirv_ControlBarrier(Workgroup, 0, 0);                                     \
+  return result;
+
+#define __CLC_GROUP_COLLECTIVE_OUTER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY) \
+  _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND(                    \
+      __spirv_Group, SPIRV_NAME)(uint scope, uint op, TYPE x) {                \
+    __CLC_GROUP_COLLECTIVE_INNER(CLC_NAME, OP, TYPE, IDENTITY)                 \
   }
 
 #define __CLC_GROUP_COLLECTIVE_4(NAME, OP, TYPE, IDENTITY)                     \
-  __CLC_GROUP_COLLECTIVE_INNER(NAME, NAME, OP, TYPE, IDENTITY)
+  __CLC_GROUP_COLLECTIVE_OUTER(NAME, NAME, OP, TYPE, IDENTITY)
 #define __CLC_GROUP_COLLECTIVE_5(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY)     \
-  __CLC_GROUP_COLLECTIVE_INNER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY)
+  __CLC_GROUP_COLLECTIVE_OUTER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY)
 
 #define DISPATCH_TO_CLC_GROUP_COLLECTIVE_MACRO(_1, _2, _3, _4, _5, NAME, ...)  \
   NAME
@@ -338,6 +488,13 @@ __CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, 0l)
       __VA_ARGS__, __CLC_GROUP_COLLECTIVE_5, __CLC_GROUP_COLLECTIVE_4)         \
   (__VA_ARGS__)
 
+#define __CLC_GROUP_COLLECTIVE_MANUAL_MANGLE(SPIRV_NAME_MANGLED, CLC_NAME, OP, \
+                                             TYPE, IDENTITY)                   \
+  _CLC_DEF _CLC_CONVERGENT TYPE SPIRV_NAME_MANGLED(uint scope, uint op,        \
+                                                   TYPE x) {                   \
+    __CLC_GROUP_COLLECTIVE_INNER(CLC_NAME, OP, TYPE, IDENTITY)                 \
+  }
+
 __CLC_GROUP_COLLECTIVE(BitwiseOr, __CLC_OR, bool, false);
 __CLC_GROUP_COLLECTIVE(BitwiseAny, __CLC_AND, bool, true);
 _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT bool __spirv_GroupAny(uint scope,
@@ -373,6 +530,16 @@ __CLC_GROUP_COLLECTIVE(FMulKHR, __CLC_MUL, half, 1)
 __CLC_GROUP_COLLECTIVE(FMulKHR, __CLC_MUL, float, 1)
 __CLC_GROUP_COLLECTIVE(FMulKHR, __CLC_MUL, double, 1)
 
+__CLC_GROUP_COLLECTIVE_MANUAL_MANGLE(
+    _Z22__spirv_GroupCMulINTELjjN5__spv12complex_halfE, CMulINTEL,
+    __CLC_COMPLEX_MUL, complex_half, ((complex_half){1, 0}))
+__CLC_GROUP_COLLECTIVE_MANUAL_MANGLE(
+    _Z22__spirv_GroupCMulINTELjjN5__spv13complex_floatE, CMulINTEL,
+    __CLC_COMPLEX_MUL, complex_float, ((complex_float){1, 0}))
+__CLC_GROUP_COLLECTIVE_MANUAL_MANGLE(
+    _Z22__spirv_GroupCMulINTELjjN5__spv14complex_doubleE, CMulINTEL,
+    __CLC_COMPLEX_MUL, complex_double, ((complex_double){1, 0}))
+
 __CLC_GROUP_COLLECTIVE(SMin, __CLC_MIN, char, CHAR_MAX)
 __CLC_GROUP_COLLECTIVE(UMin, __CLC_MIN, uchar, UCHAR_MAX)
 __CLC_GROUP_COLLECTIVE(SMin, __CLC_MIN, short, SHRT_MAX)
diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll b/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll
index 6b2e5d8880386..f2c59b3fddd31 100644
--- a/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll
+++ b/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll
@@ -1,61 +1,97 @@
-; 64 storage locations is sufficient for all current-generation NVIDIA GPUs
-; 64 bits per warp is sufficient for all fundamental data types
+; 32 storage locations is sufficient for all current-generation NVIDIA GPUs
+; 128 bits per warp is sufficient for all fundamental data types and complex
 ; Reducing storage for small data types or increasing it for user-defined types
 ; will likely require an additional pass to track group algorithm usage
-@__clc__group_scratch = internal addrspace(3) global [64 x i64] undef, align 1
+@__clc__group_scratch = internal addrspace(3) global [128 x i64] undef, align 1
 
 define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
   ret i8 addrspace(3)* %cast
 }
 
 define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
   ret i8 addrspace(3)* %cast
 }
 
 define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
   ret i16 addrspace(3)* %cast
 }
 
 define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
   ret i32 addrspace(3)* %cast
 }
 
 define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
   ret i64 addrspace(3)* %cast
 }
 
 define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
   ret half addrspace(3)* %cast
 }
 
 define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
   ret float addrspace(3)* %cast
 }
 
 define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
   %cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
   ret double addrspace(3)* %cast
 }
+
+%complex_half = type {
+  half,
+  half
+}
+
+%complex_float = type {
+  float,
+  float
+}
+
+%complex_double = type {
+  double,
+  double
+}
+
+define %complex_half addrspace(3)* @__clc__get_group_scratch_complex_half() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to %complex_half addrspace(3)*
+  ret %complex_half addrspace(3)* %cast
+}
+
+define %complex_float addrspace(3)* @__clc__get_group_scratch_complex_float() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to %complex_float addrspace(3)*
+  ret %complex_float addrspace(3)* %cast
+}
+
+define %complex_double addrspace(3)* @__clc__get_group_scratch_complex_double() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to %complex_double addrspace(3)*
+  ret %complex_double addrspace(3)* %cast
+}
diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake
index 5bb8fef6074f0..75995cc25e946 100644
--- a/libdevice/cmake/modules/SYCLLibdevice.cmake
+++ b/libdevice/cmake/modules/SYCLLibdevice.cmake
@@ -111,7 +111,7 @@ set(complex_obj_deps device_complex.h device.h sycl-compiler)
 set(cmath_obj_deps device_math.h device.h sycl-compiler)
 set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp device.h sycl-compiler)
 set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler)
-set(bfloat16_obj_deps sycl-compiler)
+set(bfloat16_obj_deps sycl-headers sycl-compiler)
 
 add_devicelib_obj(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps})
 add_devicelib_obj(libsycl-itt-compiler-wrappers SRC itt_compiler_wrappers.cpp DEP ${itt_obj_deps})
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h b/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h
index 4845ad6e97bf1..f8f1078b0669a 100644
--- a/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h
@@ -20,13 +20,19 @@ namespace llvm {
 class SYCLPropagateAspectsUsagePass
     : public PassInfoMixin<SYCLPropagateAspectsUsagePass> {
 public:
-  SYCLPropagateAspectsUsagePass(StringRef OptionsString = {}) {
+  SYCLPropagateAspectsUsagePass(std::set<StringRef> ExcludeAspects = {},
+                                bool ValidateAspects = true,
+                                StringRef OptionsString = {})
+      : ExcludedAspects{std::move(ExcludeAspects)},
+        ValidateAspectUsage{ValidateAspects} {
     OptionsString.split(this->TargetFixedAspects, ',', /*MaxSplit=*/-1,
                         /*KeepEmpty=*/false);
   };
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
 
 private:
+  std::set<StringRef> ExcludedAspects;
+  const bool ValidateAspectUsage;
   SmallVector<StringRef, 8> TargetFixedAspects;
 };
 
diff --git a/llvm/lib/SYCLLowerIR/CompileTimeProperties.def b/llvm/lib/SYCLLowerIR/CompileTimeProperties.def
index f1490e3b8e020..54e96c1823988 100644
--- a/llvm/lib/SYCLLowerIR/CompileTimeProperties.def
+++ b/llvm/lib/SYCLLowerIR/CompileTimeProperties.def
@@ -27,7 +27,7 @@ SYCL_COMPILE_TIME_PROPERTY("sycl-dwidth", 6178, DecorValueTy::uint32)
 SYCL_COMPILE_TIME_PROPERTY("sycl-latency", 6179, DecorValueTy::uint32)
 SYCL_COMPILE_TIME_PROPERTY("sycl-read-write-mode", 6180, DecorValueTy::uint32)
 SYCL_COMPILE_TIME_PROPERTY("sycl-maxburst", 6181, DecorValueTy::uint32)
-SYCL_COMPILE_TIME_PROPERTY("sycl-alignment", 6182, DecorValueTy::uint32)
-SYCL_COMPILE_TIME_PROPERTY("sycl-wait-request", 6183, DecorValueTy::uint32)
-SYCL_COMPILE_TIME_PROPERTY("sycl-stable", 6184, DecorValueTy::boolean)
+SYCL_COMPILE_TIME_PROPERTY("sycl-alignment", 44, DecorValueTy::uint32)
+SYCL_COMPILE_TIME_PROPERTY("sycl-wait-request", 6182, DecorValueTy::uint32)
+SYCL_COMPILE_TIME_PROPERTY("sycl-stable", 6183, DecorValueTy::boolean)
 SYCL_COMPILE_TIME_PROPERTY("sycl-strict", 19, DecorValueTy::boolean)
diff --git a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp
index 1d7fd512ba868..a04942c8bb447 100644
--- a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp
+++ b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp
@@ -337,6 +337,7 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M,
     if (F.getCallingConv() != CallingConv::SPIR_KERNEL)
       continue;
 
+    // Compile time properties on kernel arguments
     {
       SmallVector<Metadata *, 8> MDOps;
       MDOps.reserve(F.arg_size());
@@ -345,8 +346,12 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M,
         SmallVector<Metadata *, 8> MDArgOps;
         for (auto &Attribute : F.getAttributes().getParamAttrs(I)) {
           if (MDNode *SPIRVMetadata =
-                  attributeToDecorateMetadata(Ctx, Attribute))
+                  attributeToDecorateMetadata(Ctx, Attribute)) {
+            // sycl-alignment is not collected to SPIRV.ParamDecoration
+            if (Attribute.getKindAsString() == "sycl-alignment")
+              continue;
             MDArgOps.push_back(SPIRVMetadata);
+          }
         }
         if (!MDArgOps.empty())
           FoundKernelProperties = true;
diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
index 9479d0ee3b838..e6bd8ffe8a44d 100644
--- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
@@ -671,7 +671,8 @@ class ESIMDIntrinDescTable {
          {"test.src.tmpl.arg", {t(0), t1(1), t8(2), t16(3), t32(4), c8(17)}}},
         {"slm_init", {"slm.init", {a(0)}}},
         {"bf_cvt", {"bf.cvt", {a(0)}}},
-        {"tf32_cvt", {"tf32.cvt", {a(0)}}}};
+        {"tf32_cvt", {"tf32.cvt", {a(0)}}},
+        {"bfn", {"bfn", {a(0), a(1), a(2), t(0)}}}};
   }
 
   const IntrinTable &getTable() { return Table; }
diff --git a/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp b/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp
index c784c7607112f..5d576b7edfd73 100644
--- a/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp
@@ -50,7 +50,8 @@ void processSetKernelPropertiesCall(CallInst &CI) {
       GraphNode->addFnAttr(::sycl::kernel_props::ATTR_LARGE_GRF);
       // Add RegisterAllocMode metadata with arg 2 to the kernel to tell
       // IGC to compile this kernel in large GRF mode. 2 means large.
-      if (GraphNode->getCallingConv() == CallingConv::SPIR_KERNEL) {
+      if (GraphNode->getCallingConv() == CallingConv::SPIR_KERNEL &&
+          !GraphNode->hasMetadata("sycl_explicit_simd")) {
         auto &Ctx = GraphNode->getContext();
         Metadata *AttrMDArgs[] = {ConstantAsMetadata::get(
             Constant::getIntegerValue(Type::getInt32Ty(Ctx), APInt(32, 2)))};
diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
index b13e480170fec..0f32fd78b68de 100644
--- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
+++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
@@ -53,6 +53,11 @@ static cl::opt<std::string> ClSyclFixedTargets(
              "is expected to be runnable on"),
     cl::Hidden, cl::init(""));
 
+static cl::opt<std::string> ClSyclExcludeAspects(
+    "sycl-propagate-aspects-usage-exclude-aspects",
+    cl::desc("Specify aspects to exclude when propagating aspect usage"),
+    cl::Hidden, cl::init(""));
+
 namespace {
 
 using AspectsSetTy = SmallSet<int, 4>;
@@ -293,15 +298,37 @@ getAspectUsageChain(const Function *F, const FunctionToAspectsMapTy &AspectsMap,
   return CallChain;
 }
 
-void createUsedAspectsMetadataForFunctions(FunctionToAspectsMapTy &Map) {
+void createUsedAspectsMetadataForFunctions(
+    FunctionToAspectsMapTy &Map, const AspectsSetTy &ExcludeAspectVals) {
   for (auto &[F, Aspects] : Map) {
     if (Aspects.empty())
       continue;
 
     LLVMContext &C = F->getContext();
 
+    // Create a set of unique aspects. First we add the ones from the found
+    // aspects that have not been excluded.
+    AspectsSetTy UniqueAspects;
+    for (const int &A : Aspects)
+      if (!ExcludeAspectVals.contains(A))
+        UniqueAspects.insert(A);
+
+    // If there are no new aspects, we can just keep the old metadata.
+    if (UniqueAspects.empty())
+      continue;
+
+    // If there is new metadata, merge it with the old aspects. We preserve
+    // the excluded ones.
+    if (const MDNode *ExistingAspects = F->getMetadata("sycl_used_aspects")) {
+      for (const MDOperand &MDOp : ExistingAspects->operands()) {
+        const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
+        UniqueAspects.insert(cast<ConstantInt>(C)->getSExtValue());
+      }
+    }
+
+    // Create new metadata.
     SmallVector<Metadata *, 16> AspectsMetadata;
-    for (const auto &A : Aspects)
+    for (const int &A : UniqueAspects)
       AspectsMetadata.push_back(ConstantAsMetadata::get(
           ConstantInt::getSigned(Type::getInt32Ty(C), A)));
 
@@ -506,7 +533,8 @@ void setSyclFixedTargetsMD(const std::vector<Function *> &EntryPoints,
 FunctionToAspectsMapTy
 buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects,
                            const AspectValueToNameMapTy &AspectValues,
-                           const std::vector<Function *> &EntryPoints) {
+                           const std::vector<Function *> &EntryPoints,
+                           bool ValidateAspects) {
   FunctionToAspectsMapTy FunctionToUsedAspects;
   FunctionToAspectsMapTy FunctionToDeclaredAspects;
   CallGraphTy CG;
@@ -522,8 +550,9 @@ buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects,
   for (Function *F : EntryPoints)
     propagateAspectsThroughCG(F, CG, FunctionToUsedAspects, Visited);
 
-  validateUsedAspectsForFunctions(FunctionToUsedAspects, AspectValues,
-                                  EntryPoints, CG);
+  if (ValidateAspects)
+    validateUsedAspectsForFunctions(FunctionToUsedAspects, AspectValues,
+                                    EntryPoints, CG);
 
   // The set of aspects from FunctionToDeclaredAspects should be merged to the
   // set of FunctionToUsedAspects after validateUsedAspectsForFunctions call to
@@ -558,6 +587,14 @@ SYCLPropagateAspectsUsagePass::run(Module &M, ModuleAnalysisManager &MAM) {
     StringRef(ClSyclFixedTargets)
         .split(TargetFixedAspects, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
 
+  if (ClSyclExcludeAspects.getNumOccurrences() > 0) {
+    SmallVector<StringRef, 4> ExcludedAspectsVec;
+    StringRef(ClSyclExcludeAspects)
+        .split(ExcludedAspectsVec, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+    ExcludedAspects.insert(ExcludedAspectsVec.begin(),
+                           ExcludedAspectsVec.end());
+  }
+
   std::vector<Function *> EntryPoints;
   for (Function &F : M.functions())
     if (isEntryPoint(F))
@@ -566,9 +603,19 @@ SYCLPropagateAspectsUsagePass::run(Module &M, ModuleAnalysisManager &MAM) {
   propagateAspectsToOtherTypesInModule(M, TypesWithAspects, AspectValues);
 
   FunctionToAspectsMapTy FunctionToUsedAspects = buildFunctionsToAspectsMap(
-      M, TypesWithAspects, AspectValues, EntryPoints);
+      M, TypesWithAspects, AspectValues, EntryPoints, ValidateAspectUsage);
+
+  // Create a set of excluded aspect values.
+  AspectsSetTy ExcludedAspectVals;
+  for (const StringRef &AspectName : ExcludedAspects) {
+    const auto AspectValIter = AspectValues.find(AspectName);
+    assert(AspectValIter != AspectValues.end() &&
+           "Excluded aspect does not have a corresponding value.");
+    ExcludedAspectVals.insert(AspectValIter->second);
+  }
 
-  createUsedAspectsMetadataForFunctions(FunctionToUsedAspects);
+  createUsedAspectsMetadataForFunctions(FunctionToUsedAspects,
+                                        ExcludedAspectVals);
 
   setSyclFixedTargetsMD(EntryPoints, TargetFixedAspects, AspectValues);
 
diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/double-prop-after-exclude.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/double-prop-after-exclude.ll
new file mode 100644
index 0000000000000..8eb2512b507d6
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/double-prop-after-exclude.ll
@@ -0,0 +1,48 @@
+; RUN: opt -passes=sycl-propagate-aspects-usage -sycl-propagate-aspects-usage-exclude-aspects=fp64 < %s -S -o %t_first.ll
+; RUN: opt -passes=sycl-propagate-aspects-usage < %t_first.ll -S -o %t_second.ll
+; FileCheck %s --input-file %t_first.ll --check-prefix=CHECK-FIRST
+; FileCheck %s --input-file %t_second.ll --check-prefix=CHECK-SECOND
+;
+; Test checks that fp64 usage is correctly propagate in the two-run model.
+
+%composite = type { double }
+
+; CHECK-FIRST-NOT: spir_kernel void @kernel() {{.*}} !sycl_used_aspects
+; CHECK-SECOND: spir_kernel void @kernel() !sycl_used_aspects ![[MDID:]]
+define spir_kernel void @kernel() {
+  call spir_func void @func()
+  ret void
+}
+
+; CHECK-FIRST-NOT: spir_func void @func() {{.*}} !sycl_used_aspects
+; CHECK-SECOND: spir_func void @func() !sycl_used_aspects ![[MDID]] {
+define spir_func void @func() {
+  %tmp = alloca double
+  ret void
+}
+
+; CHECK-FIRST-NOT: spir_func void @func.array() {{.*}} !sycl_used_aspects
+; CHECK-SECOND: spir_func void @func.array() !sycl_used_aspects ![[MDID]] {
+define spir_func void @func.array() {
+  %tmp = alloca [4 x double]
+  ret void
+}
+
+; CHECK-FIRST-NOT: spir_func void @func.vector() {{.*}} !sycl_used_aspects
+; CHECK-SECOND: spir_func void @func.vector() !sycl_used_aspects ![[MDID]] {
+define spir_func void @func.vector() {
+  %tmp = alloca <4 x double>
+  ret void
+}
+
+; CHECK-FIRST-NOT: spir_func void @func.composite() {{.*}} !sycl_used_aspects
+; CHECK-SECOND: spir_func void @func.composite() !sycl_used_aspects ![[MDID]] {
+define spir_func void @func.composite() {
+  %tmp = alloca %composite
+  ret void
+}
+
+!sycl_aspects = !{!0}
+!0 = !{!"fp64", i32 6}
+
+; CHECK-SECOND: ![[MDID]] = !{i32 6}
diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll
new file mode 100644
index 0000000000000..59c7964cd60ad
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll
@@ -0,0 +1,139 @@
+; RUN: opt -passes=sycl-propagate-aspects-usage -sycl-propagate-aspects-usage-exclude-aspects=aspect4,aspect1 -S < %s | FileCheck %s
+;
+; Test checks that the pass is able to collect all aspects used in a function
+
+%A = type { i32 }
+%B = type { i32 }
+%C = type { i32 }
+%D = type { i32 }
+
+; None of funcA's aspects are excluded.
+; CHECK: define spir_func void @funcA() !sycl_used_aspects ![[#ID0:]] {
+define spir_func void @funcA() {
+  %tmp = alloca %A
+  ret void
+}
+
+; funcB uses "aspect1" which is excluded, so the resulting aspects are the same
+; as for funcA.
+; CHECK: define spir_func void @funcB() !sycl_used_aspects ![[#ID0]] {
+define spir_func void @funcB() {
+  %tmp = alloca %B
+  call spir_func void @funcA()
+  ret void
+}
+
+; funcC has an aspect excluded, propagated from funcB.
+; CHECK: define spir_func void @funcC() !sycl_used_aspects ![[#ID1:]] {
+define spir_func void @funcC() {
+  %tmp = alloca %C
+  call spir_func void @funcB()
+  ret void
+}
+
+; funcD has two aspects excluded; one from the use of D and one from propagated.
+; from funcB and funcC.
+; CHECK: define spir_func void @funcD() !sycl_used_aspects ![[#ID2:]] {
+define spir_func void @funcD() {
+  %tmp = alloca %D
+  call spir_func void @funcC()
+  ret void
+}
+
+; kernel1 has the same aspects as funcD.
+; CHECK: define spir_kernel void @kernel1() !sycl_used_aspects ![[#ID2]]
+define spir_kernel void @kernel1() {
+  call spir_func void @funcD()
+  ret void
+}
+
+; funcE should get none of its explicitly declared aspects in its
+; sycl_used_aspects
+; CHECK: define spir_func void @funcE() !sycl_declared_aspects ![[#DA1:]] {
+define spir_func void @funcE() !sycl_declared_aspects !10 {
+  ret void
+}
+
+; funcF should have the same aspects as funcE
+; CHECK-NOT: define spir_func void @funcF() {{.*}} !sycl_used_aspects
+define spir_func void @funcF() {
+  call spir_func void @funcE()
+  ret void
+}
+
+; funcG only keeps one aspect, the rest are excluded
+; CHECK: define spir_func void @funcG() !sycl_declared_aspects ![[#DA2:]] !sycl_used_aspects ![[#ID3:]]
+define spir_func void @funcG() !sycl_declared_aspects !11 {
+  ret void
+}
+
+; funcH should have the same aspects as funcG
+; CHECK: define spir_func void @funcH() !sycl_used_aspects ![[#ID3]]
+define spir_func void @funcH() {
+  call spir_func void @funcG()
+  ret void
+}
+
+; CHECK: define spir_kernel void @kernel2() !sycl_used_aspects ![[#ID3]]
+define spir_kernel void @kernel2() {
+  call spir_func void @funcF()
+  call spir_func void @funcH()
+  ret void
+}
+
+; CHECK: define spir_func void @funcI() !sycl_used_aspects ![[#DA1]] {
+define spir_func void @funcI() !sycl_used_aspects !10 {
+  ret void
+}
+
+; CHECK-NOT: define spir_func void @funcJ() {{.*}} !sycl_used_aspects
+define spir_func void @funcJ() {
+  call spir_func void @funcI()
+  ret void
+}
+
+;
+; Note that the listed aspects can be reordered due to the merging of the
+; aspect sets.
+; CHECK: define spir_func void @funcK() !sycl_used_aspects ![[#ID4:]] {
+define spir_func void @funcK() !sycl_used_aspects !11 {
+  ret void
+}
+
+; CHECK: define spir_func void @funcL() !sycl_used_aspects ![[#ID3]]
+define spir_func void @funcL() {
+  call spir_func void @funcK()
+  ret void
+}
+
+; CHECK: define spir_kernel void @kernel3() !sycl_used_aspects ![[#ID3]]
+define spir_kernel void @kernel3() {
+  call spir_func void @funcK()
+  call spir_func void @funcL()
+  ret void
+}
+
+!sycl_types_that_use_aspects = !{!0, !1, !2, !3}
+!0 = !{!"A", i32 0}
+!1 = !{!"B", i32 1}
+!2 = !{!"C", i32 2}
+!3 = !{!"D", i32 3, i32 4}
+
+!sycl_aspects = !{!4, !5, !6, !7, !8, !9}
+!4 = !{!"aspect0", i32 0}
+!5 = !{!"aspect1", i32 1}
+!6 = !{!"aspect2", i32 2}
+!7 = !{!"aspect3", i32 3}
+!8 = !{!"aspect4", i32 4}
+!9 = !{!"fp64", i32 5}
+
+!10 = !{i32 1}
+!11 = !{i32 4, i32 2, i32 1}
+; CHECK-DAG: ![[#DA1]] = !{i32 1}
+; CHECK-DAG: ![[#DA2]] = !{i32 4, i32 2, i32 1}
+
+; CHECK-DAG: ![[#ID0]] = !{i32 0}
+; CHECK-DAG: ![[#ID1]] = !{i32 2, i32 0}
+; CHECK-DAG: ![[#ID2]] = !{i32 0, i32 2, i32 3}
+; CHECK-DAG: ![[#ID3]] = !{i32 2}
+; CHECK-DAG: ![[#ID4]] = !{i32 2, i32 4, i32 1}
diff --git a/llvm/test/SYCLLowerIR/lower_kernel_props.ll b/llvm/test/SYCLLowerIR/lower_kernel_props.ll
index 4d7793c39a887..f2cef0617739c 100644
--- a/llvm/test/SYCLLowerIR/lower_kernel_props.ll
+++ b/llvm/test/SYCLLowerIR/lower_kernel_props.ll
@@ -5,7 +5,7 @@
 ; - remove the intrinsic call 
 ; - mark the kernel with corresponding attribute (only "large-grf" for now)
 
-; RUN: opt -passes=lower-kernel-props -S %s -o - | FileCheck %s
+; RUN: opt -passes=lower-kernel-props -S %s -o - | FileCheck %s --implicit-check-not='RegisterAllocMode'
 
 ; ModuleID = 'large_grf.bc'
 source_filename = "llvm-link"
@@ -33,8 +33,7 @@ define weak_odr dso_local spir_kernel void @__large_grf_kernel1() !sycl_explicit
 
 ; -- This kernel calls the marker function directly
 define weak_odr dso_local spir_kernel void @__large_grf_kernel2() #0 !sycl_explicit_simd !0 !intel_reqd_sub_group_size !1 {
-; CHECK: {{.*}} spir_kernel void @__large_grf_kernel2() #0 {{.*}} !RegisterAllocMode ![[MetadataArg:[0-9]+]]
-; CHECK: ![[MetadataArg]] = !{i32 2}
+; CHECK: {{.*}} spir_kernel void @__large_grf_kernel2() #0 {{.*}}
   call spir_func void @_Z28__sycl_set_kernel_propertiesi(i32 noundef 0)
   ret void
 }
diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll b/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll
index 413380ae70432..919eabccfa3c4 100644
--- a/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll
+++ b/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll
@@ -9,7 +9,7 @@
 
 ; RUN: sycl-post-link -split=source -symbols -split-esimd -lower-esimd -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t.table
-; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR
+; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR  --implicit-check-not='RegisterAllocMode'
 ; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.prop --check-prefixes CHECK-ESIMD-LargeGRF-PROP
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYCL-SYM
 ; RUN: FileCheck %s -input-file=%t_esimd_1.sym --check-prefixes CHECK-ESIMD-SYM
@@ -58,8 +58,7 @@ entry:
 declare dso_local spir_func void @_Z28__sycl_set_kernel_propertiesi(i32 noundef)
 
 define weak_odr dso_local spir_kernel void @__ESIMD_large_grf_kernel() #0 !sycl_explicit_simd !0 !intel_reqd_sub_group_size !1 {
-; CHECK-ESIMD-LargeGRF-IR: @__ESIMD_large_grf_kernel() {{.*}} !RegisterAllocMode ![[MetadataArg:[0-9]+]]
-; CHECK-ESIMD-LargeGRF-IR: ![[MetadataArg]] = !{i32 2}
+; CHECK-ESIMD-LargeGRF-IR: @__ESIMD_large_grf_kernel() {{.*}}
 entry:
   call spir_func void @_Z17large_grf_markerv()
   ret void
diff --git a/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll b/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll
index f6cec2be141ef..8f04701bfcac8 100644
--- a/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll
+++ b/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll
@@ -29,9 +29,8 @@ entry:
 !1587 = !{i32 -1}
 !1588 = !{i32 -1, i32 -1, i32 -1}
 ; CHECK-DAG: ![[PARMDECOR_CASE1]] = !{![[ARG:[0-9]+]]}
-; CHECK-DAG: ![[ARG]] = !{![[ALIGN:[0-9]+]], ![[AWIDTH:[0-9]+]], ![[BL:[0-9]+]], ![[CONDUIT:[0-9]+]], ![[DWIDTH:[0-9]+]], ![[LATENCY:[0-9]+]], ![[MAXBURST:[0-9]+]], ![[RWMODE:[0-9]+]], ![[REGMAP:[0-9]+]], ![[STABLE:[0-9]+]], ![[STRICT:[0-9]+]], ![[WAITREQ:[0-9]+]]}
+; CHECK-DAG: ![[ARG]] = !{![[AWIDTH:[0-9]+]], ![[BL:[0-9]+]], ![[CONDUIT:[0-9]+]], ![[DWIDTH:[0-9]+]], ![[LATENCY:[0-9]+]], ![[MAXBURST:[0-9]+]], ![[RWMODE:[0-9]+]], ![[REGMAP:[0-9]+]], ![[STABLE:[0-9]+]], ![[STRICT:[0-9]+]], ![[WAITREQ:[0-9]+]]}
 
-; CHECK: ![[ALIGN]]   = !{i32 6182, i32 4}
 ; CHECK: ![[AWIDTH]]  = !{i32 6177, i32 32}
 ; CHECK: ![[BL]]      = !{i32 5921, i32 10}
 ; CHECK: ![[CONDUIT]] = !{i32 6175, i32 1}
@@ -40,13 +39,11 @@ entry:
 ; CHECK: ![[MAXBURST]] = !{i32 6181, i32 3}
 ; CHECK: ![[RWMODE]]  = !{i32 6180, i32 2}
 ; CHECK: ![[REGMAP]]  = !{i32 6176, i32 1}
-; CHECK: ![[STABLE]]  = !{i32 6184, i32 1}
+; CHECK: ![[STABLE]]  = !{i32 6183, i32 1}
 ; CHECK: ![[STRICT]]  = !{i32 19, i32 1}
-; CHECK: ![[WAITREQ]] = !{i32 6183, i32 5}
+; CHECK: ![[WAITREQ]] = !{i32 6182, i32 5}
 
-; CHECK-DAG: ![[PARMDECOR_CASE2]] = !{![[ARG1:[0-9]+]], ![[ARG2:[0-9]+]], ![[ARG3:[0-9]+]]}
-; CHECK-DAG: ![[ARG1]] = !{![[ALIGN1:[0-9]+]]}
-; CHECK: ![[ALIGN1]]   = !{i32 6182, i32 8}
-; CHECK-DAG: ![[ARG2]] = !{}
+; CHECK-DAG: ![[PARMDECOR_CASE2]] = !{![[ARG1:[0-9]+]], ![[ARG1:[0-9]+]], ![[ARG3:[0-9]+]]}
+; CHECK-DAG: ![[ARG1]] = !{}
 ; CHECK-DAG: ![[ARG3]] = !{![[AWIDTH3:[0-9]+]]}
 ; CHECK: ![[AWIDTH3]]  = !{i32 6177, i32 64}
diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt
index 1442a1ac43075..4ed26bd5e719d 100644
--- a/opencl/CMakeLists.txt
+++ b/opencl/CMakeLists.txt
@@ -44,7 +44,7 @@ FetchContent_GetProperties(ocl-headers)
 set(OpenCL_INCLUDE_DIR
   ${ocl-headers_SOURCE_DIR} CACHE PATH "Path to OpenCL Headers")
 
-target_compile_definitions(Headers INTERFACE -DCL_TARGET_OPENCL_VERSION=220)
+target_compile_definitions(Headers INTERFACE -DCL_TARGET_OPENCL_VERSION=300)
 add_library(OpenCL-Headers ALIAS Headers)
 
 # OpenCL Library (ICD Loader)
diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index bb3cf0476c689..c4ab71454af84 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -162,6 +162,12 @@ endif()
 if ("esimd_emulator" IN_LIST SYCL_ENABLE_PLUGINS)
   set(SYCL_BUILD_PI_ESIMD_EMULATOR ON)
 endif()
+if ("opencl" IN_LIST SYCL_ENABLE_PLUGINS)
+  set(SYCL_BUILD_PI_OPENCL ON)
+endif()
+if ("level_zero" IN_LIST SYCL_ENABLE_PLUGINS)
+  set(SYCL_BUILD_PI_LEVEL_ZERO ON)
+endif()
 
 # Configure SYCL version macro
 set(sycl_inc_dir ${CMAKE_CURRENT_SOURCE_DIR}/include)
diff --git a/sycl/doc/design/KernelProgramCache.md b/sycl/doc/design/KernelProgramCache.md
index fa46ab6e70289..7818706314c1e 100644
--- a/sycl/doc/design/KernelProgramCache.md
+++ b/sycl/doc/design/KernelProgramCache.md
@@ -215,7 +215,7 @@ kernel of this program will be cached also.
 All requests to build a program or to create a kernel - whether they originate
 from explicit user API calls or from internal SYCL runtime execution logic - end
 up with calling the function
-[`getOrBuild()`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp#L149)
+[`getOrBuild()`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp)
 with number of lambda functions passed as arguments:
 
 - Acquire function;
@@ -225,7 +225,7 @@ with number of lambda functions passed as arguments:
 *Acquire* function returns a locked version of cache. Locking is employed for
 thread safety. The threads are blocked only for insert-or-acquire attempt, i.e.
 when calling to `map::insert` in
-[`getOrBuild`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp#L149)
+[`getOrBuild()`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp)
 function. The rest of operation is done with the help of atomics and condition
 variables (plus a mutex for proper work of condition variable).
 
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
index b2dfa639b3f75..fdf3ced15afde 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc
@@ -44,7 +44,7 @@ This extension also depends on the following other SYCL extensions:
 
 * link:../experimental/sycl_ext_oneapi_properties.asciidoc[
   sycl_ext_oneapi_properties]
-* link:../proposed/sycl_ext_oneapi_kernel_properties.asciidoc[
+* link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[
   sycl_ext_oneapi_kernel_properties]
 
 
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc
index 77f3b8cf179c8..1b58d93091c47 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc
@@ -330,3 +330,5 @@ they are both empty `weak_object` instances.
 
 |===
 
+The `weak_object` class, the `ext_oneapi_owner_before` member functions and the
+`owner_less` function object type must not be used in device code.
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 610a400af48d4..9cd947c21e8d7 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 #include <CL/__spirv/spirv_types.hpp>
+#include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/detail/defines.hpp>
@@ -1061,6 +1062,19 @@ __CLC_BF16_SCAL_VEC(uint32_t)
 #undef __CLC_BF16_SCAL_VEC
 #undef __CLC_BF16
 
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL
+    __SYCL_EXPORT __spv::complex_half
+    __spirv_GroupCMulINTEL(unsigned int, unsigned int,
+                           __spv::complex_half) noexcept;
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL
+    __SYCL_EXPORT __spv::complex_float
+    __spirv_GroupCMulINTEL(unsigned int, unsigned int,
+                           __spv::complex_float) noexcept;
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL
+    __SYCL_EXPORT __spv::complex_double
+    __spirv_GroupCMulINTEL(unsigned int, unsigned int,
+                           __spv::complex_double) noexcept;
+
 extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInGlobalHWThreadIDINTEL();
 extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInSubDeviceIDINTEL();
 
diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp
index 89cac7a9d2e07..82c5a39c1500d 100644
--- a/sycl/include/CL/__spirv/spirv_types.hpp
+++ b/sycl/include/CL/__spirv/spirv_types.hpp
@@ -8,9 +8,12 @@
 
 #pragma once
 
+#include "sycl/half_type.hpp"
 #include <sycl/detail/defines.hpp>
 #include <sycl/detail/defines_elementary.hpp>
+#include <sycl/half_type.hpp>
 
+#include <complex>
 #include <cstddef>
 #include <cstdint>
 
@@ -128,6 +131,27 @@ enum class MatrixLayout : uint32_t {
 
 enum class MatrixUse : uint32_t { MatrixA = 0, MatrixB = 1, Accumulator = 2 };
 
+struct complex_float {
+  complex_float() = default;
+  complex_float(std::complex<float> x) : real(x.real()), imag(x.imag()) {}
+  operator std::complex<float>() { return {real, imag}; }
+  float real, imag;
+};
+
+struct complex_double {
+  complex_double() = default;
+  complex_double(std::complex<double> x) : real(x.real()), imag(x.imag()) {}
+  operator std::complex<double>() { return {real, imag}; }
+  double real, imag;
+};
+
+struct complex_half {
+  complex_half() = default;
+  complex_half(std::complex<sycl::half> x) : real(x.real()), imag(x.imag()) {}
+  operator std::complex<sycl::half>() { return {real, imag}; }
+  sycl::half real, imag;
+};
+
 #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1)
 template <typename T, std::size_t R, std::size_t C, MatrixLayout L,
           Scope::Flag S = Scope::Flag::Subgroup,
diff --git a/sycl/include/sycl/accessor.hpp b/sycl/include/sycl/accessor.hpp
index 5911d55530300..c4ddf817171a1 100644
--- a/sycl/include/sycl/accessor.hpp
+++ b/sycl/include/sycl/accessor.hpp
@@ -2089,8 +2089,9 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor :
   }
 
   template <access::target AccessTarget_ = AccessTarget,
-            typename = detail::enable_if_t<AccessTarget_ ==
-                                           access::target::host_buffer>>
+            typename = detail::enable_if_t<
+                (AccessTarget_ == access::target::host_buffer) ||
+                (AccessTarget_ == access::target::host_task)>>
 #if SYCL_LANGUAGE_VERSION >= 202001
   std::add_pointer_t<value_type> get_pointer() const noexcept
 #else
@@ -2667,10 +2668,6 @@ class __SYCL_SPECIAL_CLASS local_accessor_base :
     return AccessorSubscript<Dims - 1>(*this, Index);
   }
 
-  local_ptr<DataT> get_pointer() const {
-    return local_ptr<DataT>(getQualifiedPtr());
-  }
-
   bool operator==(const local_accessor_base &Rhs) const {
     return impl == Rhs.impl;
   }
@@ -2699,6 +2696,11 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS accessor<
   // Use base classes constructors
   using local_acc::local_acc;
 
+public:
+  local_ptr<DataT> get_pointer() const {
+    return local_ptr<DataT>(local_acc::getQualifiedPtr());
+  }
+
 #ifdef __SYCL_DEVICE_ONLY__
 
   // __init needs to be defined within the class not through inheritance.
@@ -2805,6 +2807,10 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor
     return const_reverse_iterator(begin());
   }
 
+  std::add_pointer_t<value_type> get_pointer() const noexcept {
+    return std::add_pointer_t<value_type>(local_acc::getQualifiedPtr());
+  }
+
   template <access::decorated IsDecorated>
   accessor_ptr<IsDecorated> get_multi_ptr() const noexcept {
     return accessor_ptr<IsDecorated>(local_acc::getQualifiedPtr());
diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp
index fe6eeaaec2e4e..9e808e60ddebc 100644
--- a/sycl/include/sycl/builtins.hpp
+++ b/sycl/include/sycl/builtins.hpp
@@ -122,10 +122,27 @@ __SYCL_MATH_FUNCTION_OVERLOAD_FM(log2)
 __SYCL_MATH_FUNCTION_OVERLOAD_FM(log10)
 __SYCL_MATH_FUNCTION_OVERLOAD_FM(sqrt)
 __SYCL_MATH_FUNCTION_OVERLOAD_FM(rsqrt)
+__SYCL_MATH_FUNCTION_OVERLOAD_FM(fabs)
 
 #undef __SYCL_MATH_FUNCTION_OVERLOAD_FM
 #undef __SYCL_MATH_FUNCTION_OVERLOAD_IMPL
 
+template <typename T, size_t N>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<detail::is_sgenfloat<T>::value, marray<int, N>>
+    ilogb(marray<T, N> x) __NOEXC {
+  marray<int, N> res;
+  for (size_t i = 0; i < N / 2; i++) {
+    vec<int, 2> partial_res =
+        __sycl_std::__invoke_ilogb<vec<int, 2>>(detail::to_vec2(x, i * 2));
+    std::memcpy(&res[i * 2], &partial_res, sizeof(vec<int, 2>));
+  }
+  if (N % 2) {
+    res[N - 1] = __sycl_std::__invoke_ilogb<int>(x[N - 1]);
+  }
+  return res;
+}
+
 #define __SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL(NAME)                             \
   marray<T, N> res;                                                            \
   for (size_t i = 0; i < N / 2; i++) {                                         \
@@ -170,6 +187,98 @@ inline __SYCL_ALWAYS_INLINE
 
 #undef __SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL
 
+#define __SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD(NAME)                      \
+  template <typename T, size_t N>                                              \
+  inline __SYCL_ALWAYS_INLINE                                                  \
+      std::enable_if_t<detail::is_sgenfloat<T>::value, marray<T, N>>           \
+      NAME(marray<T, N> x, T y) __NOEXC {                                      \
+    marray<T, N> res;                                                          \
+    sycl::vec<T, 2> y_vec{y, y};                                               \
+    for (size_t i = 0; i < N / 2; i++) {                                       \
+      auto partial_res = __sycl_std::__invoke_##NAME<vec<T, 2>>(               \
+          detail::to_vec2(x, i * 2), y_vec);                                   \
+      std::memcpy(&res[i * 2], &partial_res, sizeof(vec<T, 2>));               \
+    }                                                                          \
+    if (N % 2) {                                                               \
+      res[N - 1] = __sycl_std::__invoke_##NAME<T>(x[N - 1], y_vec[0]);         \
+    }                                                                          \
+    return res;                                                                \
+  }
+
+__SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD(fmax)
+    // clang-format off
+__SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD(fmin)
+
+#undef __SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD
+
+template <typename T, size_t N>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<detail::is_sgenfloat<T>::value, marray<T, N>>
+    ldexp(marray<T, N> x, marray<int, N> k) __NOEXC {
+  // clang-format on
+  marray<T, N> res;
+  for (size_t i = 0; i < N; i++) {
+    res[i] = __sycl_std::__invoke_ldexp<T>(x[i], k[i]);
+  }
+  return res;
+}
+
+template <typename T, size_t N>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<detail::is_sgenfloat<T>::value, marray<T, N>>
+    ldexp(marray<T, N> x, int k) __NOEXC {
+  marray<T, N> res;
+  for (size_t i = 0; i < N; i++) {
+    res[i] = __sycl_std::__invoke_ldexp<T>(x[i], k);
+  }
+  return res;
+}
+
+#define __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL(NAME)                    \
+  marray<T, N> res;                                                            \
+  for (size_t i = 0; i < N; i++) {                                             \
+    res[i] = __sycl_std::__invoke_##NAME<T>(x[i], y[i]);                       \
+  }                                                                            \
+  return res;
+
+template <typename T, size_t N>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<detail::is_sgenfloat<T>::value, marray<T, N>>
+    pown(marray<T, N> x, marray<int, N> y) __NOEXC {
+  __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL(pown)
+}
+
+template <typename T, size_t N>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<detail::is_sgenfloat<T>::value, marray<T, N>>
+    rootn(marray<T, N> x, marray<int, N> y) __NOEXC {
+  __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL(rootn)
+}
+
+#undef __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL
+
+#define __SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL(NAME)                       \
+  marray<T, N> res;                                                            \
+  for (size_t i = 0; i < N; i++) {                                             \
+    res[i] = __sycl_std::__invoke_##NAME<T>(x[i], y);                          \
+  }                                                                            \
+  return res;
+
+template <typename T, size_t N>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<detail::is_sgenfloat<T>::value, marray<T, N>>
+    pown(marray<T, N> x, int y) __NOEXC {
+  __SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL(pown)
+}
+
+template <typename T, size_t N>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<detail::is_sgenfloat<T>::value, marray<T, N>>
+    rootn(marray<T, N> x,
+          int y) __NOEXC{__SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL(rootn)}
+
+#undef __SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL
+
 #define __SYCL_MATH_FUNCTION_3_OVERLOAD(NAME)                                  \
   template <typename T, size_t N>                                              \
   inline __SYCL_ALWAYS_INLINE                                                  \
@@ -789,6 +898,80 @@ detail::enable_if_t<detail::is_svgenfloat<T>::value, T> sign(T x) __NOEXC {
   return __sycl_std::__invoke_sign<T>(x);
 }
 
+// marray common functions
+
+// TODO: can be optimized in the way math functions are optimized (usage of
+// vec<T, 2>)
+#define __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, ...)                 \
+  T res;                                                                       \
+  for (int i = 0; i < T::size(); i++) {                                        \
+    res[i] = NAME(__VA_ARGS__);                                                \
+  }                                                                            \
+  return res;
+
+#define __SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(NAME, ARG, ...)            \
+  template <typename T,                                                        \
+            typename = std::enable_if_t<detail::is_mgenfloat<T>::value>>       \
+  T NAME(ARG) __NOEXC {                                                        \
+    __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, __VA_ARGS__)             \
+  }
+
+__SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(degrees, T radians, radians[i])
+__SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(radians, T degrees, degrees[i])
+__SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(sign, T x, x[i])
+
+#undef __SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD
+
+#define __SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD(NAME, ARG1, ARG2, ...)    \
+  template <typename T,                                                        \
+            typename = std::enable_if_t<detail::is_mgenfloat<T>::value>>       \
+  T NAME(ARG1, ARG2) __NOEXC {                                                 \
+    __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, __VA_ARGS__)             \
+  }
+
+// min and max may be defined as macros, so we wrap them in parentheses to avoid
+// errors.
+__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((min), T x, T y, x[i], y[i])
+__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((min), T x,
+                                             detail::marray_element_type<T> y,
+                                             x[i], y)
+__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((max), T x, T y, x[i], y[i])
+__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((max), T x,
+                                             detail::marray_element_type<T> y,
+                                             x[i], y)
+__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD(step, T edge, T x, edge[i], x[i])
+__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD(
+    step, detail::marray_element_type<T> edge, T x, edge, x[i])
+
+#undef __SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD
+
+#define __SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(NAME, ARG1, ARG2, ARG3,   \
+                                                     ...)                      \
+  template <typename T,                                                        \
+            typename = std::enable_if_t<detail::is_mgenfloat<T>::value>>       \
+  T NAME(ARG1, ARG2, ARG3) __NOEXC {                                           \
+    __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, __VA_ARGS__)             \
+  }
+
+__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(clamp, T x, T minval, T maxval,
+                                             x[i], minval[i], maxval[i])
+__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(
+    clamp, T x, detail::marray_element_type<T> minval,
+    detail::marray_element_type<T> maxval, x[i], minval, maxval)
+__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(mix, T x, T y, T a, x[i], y[i],
+                                             a[i])
+__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(mix, T x, T y,
+                                             detail::marray_element_type<T> a,
+                                             x[i], y[i], a)
+__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(smoothstep, T edge0, T edge1, T x,
+                                             edge0[i], edge1[i], x[i])
+__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(
+    smoothstep, detail::marray_element_type<T> edge0,
+    detail::marray_element_type<T> edge1, T x, edge0, edge1, x[i])
+
+#undef __SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD
+#undef __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL
+
 /* --------------- 4.13.4 Integer functions. --------------------------------*/
 // ugeninteger abs (geninteger x)
 template <typename T>
@@ -1433,8 +1616,7 @@ any(T x) __NOEXC {
 
 // int any (vigeninteger x)
 template <typename T>
-detail::enable_if_t<detail::is_vigeninteger<T>::value, detail::anyall_ret_t>
-any(T x) __NOEXC {
+detail::enable_if_t<detail::is_vigeninteger<T>::value, int> any(T x) __NOEXC {
   return detail::rel_sign_bit_test_ret_t<T>(
       __sycl_std::__invoke_Any<detail::rel_sign_bit_test_ret_t<T>>(
           detail::rel_sign_bit_test_arg_t<T>(x)));
@@ -1449,8 +1631,7 @@ all(T x) __NOEXC {
 
 // int all (vigeninteger x)
 template <typename T>
-detail::enable_if_t<detail::is_vigeninteger<T>::value, detail::anyall_ret_t>
-all(T x) __NOEXC {
+detail::enable_if_t<detail::is_vigeninteger<T>::value, int> all(T x) __NOEXC {
   return detail::rel_sign_bit_test_ret_t<T>(
       __sycl_std::__invoke_All<detail::rel_sign_bit_test_ret_t<T>>(
           detail::rel_sign_bit_test_arg_t<T>(x)));
@@ -1724,6 +1905,7 @@ __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(cos)
 __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(tan)
 __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp)
 __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp2)
+__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp10)
 __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log)
 __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log2)
 __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log10)
diff --git a/sycl/include/sycl/detail/cl.h b/sycl/include/sycl/detail/cl.h
index 7e90fe126e40d..20d640bcff59f 100644
--- a/sycl/include/sycl/detail/cl.h
+++ b/sycl/include/sycl/detail/cl.h
@@ -9,9 +9,9 @@
 #pragma once
 
 // Suppress a compiler message about undefined CL_TARGET_OPENCL_VERSION
-// and define all symbols up to OpenCL 2.2
+// and define all symbols up to OpenCL 3.0
 #ifndef CL_TARGET_OPENCL_VERSION
-#define CL_TARGET_OPENCL_VERSION 220
+#define CL_TARGET_OPENCL_VERSION 300
 #endif
 
 #include <CL/cl.h>
diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp
index 09e1bb4850139..a8c479b816562 100644
--- a/sycl/include/sycl/detail/generic_type_traits.hpp
+++ b/sycl/include/sycl/detail/generic_type_traits.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include <CL/__spirv/spirv_types.hpp>
 #include <sycl/access/access.hpp>
 #include <sycl/aliases.hpp>
 #include <sycl/detail/common.hpp>
@@ -16,6 +17,7 @@
 #include <sycl/half_type.hpp>
 #include <sycl/multi_ptr.hpp>
 
+#include <complex>
 #include <limits>
 
 namespace sycl {
@@ -446,6 +448,14 @@ using select_cl_scalar_float_t =
     select_apply_cl_scalar_t<T, std::false_type, sycl::opencl::cl_half,
                              sycl::opencl::cl_float, sycl::opencl::cl_double>;
 
+template <typename T>
+using select_cl_scalar_complex_or_T_t = std::conditional_t<
+    std::is_same<T, std::complex<float>>::value, __spv::complex_float,
+    std::conditional_t<
+        std::is_same<T, std::complex<double>>::value, __spv::complex_double,
+        std::conditional_t<std::is_same<T, std::complex<half>>::value,
+                           __spv::complex_half, T>>>;
+
 template <typename T>
 using select_cl_scalar_integral_t =
     conditional_t<std::is_signed<T>::value,
@@ -457,12 +467,13 @@ using select_cl_scalar_integral_t =
 template <typename T>
 using select_cl_scalar_t = conditional_t<
     std::is_integral<T>::value, select_cl_scalar_integral_t<T>,
-    conditional_t<
-        std::is_floating_point<T>::value, select_cl_scalar_float_t<T>,
-        // half is a special case: it is implemented differently on host and
-        // device and therefore, might lower to different types
-        conditional_t<std::is_same<T, half>::value,
-                      sycl::detail::half_impl::BIsRepresentationT, T>>>;
+    conditional_t<std::is_floating_point<T>::value, select_cl_scalar_float_t<T>,
+                  // half is a special case: it is implemented differently on
+                  // host and device and therefore, might lower to different
+                  // types
+                  conditional_t<std::is_same<T, half>::value,
+                                sycl::detail::half_impl::BIsRepresentationT,
+                                select_cl_scalar_complex_or_T_t<T>>>>;
 
 // select_cl_vector_or_scalar_or_ptr does cl_* type selection for element type
 // of a vector type T, pointer type substitution, and scalar type substitution.
diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index f5563c28ad12e..474ced1a5fe91 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -65,8 +65,10 @@ template <typename ElementType, access::address_space Space,
 class multi_ptr;
 
 template <class T>
-inline constexpr bool is_group_v =
-    detail::is_group<T>::value || detail::is_sub_group<T>::value;
+struct is_group : std::bool_constant<detail::is_group<T>::value ||
+                                     detail::is_sub_group<T>::value> {};
+
+template <class T> inline constexpr bool is_group_v = is_group<T>::value;
 
 namespace ext::oneapi::experimental {
 template <class T>
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp
index d6da54db85118..bed0f2dcd1d14 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp
@@ -714,6 +714,19 @@ __esimd_dpasw_nosrc0(__ESIMD_DNS::vector_type_t<T1, N1> src1,
 }
 #endif // !__SYCL_DEVICE_ONLY__
 
+template <uint8_t FuncControl, typename T, int N>
+__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
+    __esimd_bfn(__ESIMD_raw_vec_t(T, N) src0, __ESIMD_raw_vec_t(T, N) src1,
+                __ESIMD_raw_vec_t(T, N) src2)
+#ifdef __SYCL_DEVICE_ONLY__
+        ;
+#else  // !__SYCL_DEVICE_ONLY__
+{
+  __ESIMD_UNSUPPORTED_ON_HOST;
+  return __ESIMD_DNS::vector_type_t<T, N>();
+}
+#endif // !__SYCL_DEVICE_ONLY__
+
 #undef __ESIMD_raw_vec_t
 #undef __ESIMD_cpp_vec_t
 
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp
index 7783b13a3d064..e337e12c66ebf 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp
@@ -1727,6 +1727,106 @@ __ESIMD_API __ESIMD_NS::simd<T, N> dpasw2(
 }
 /// @} sycl_esimd_systolic_array_api
 
+/// @addtogroup sycl_esimd_logical
+/// @{
+
+/// This enum is used to encode all possible logical operations performed
+/// on the 3 input operands. It is used as a template argument of the bfn()
+/// function.
+/// Example: d = bfn<~bfn_t::x & ~bfn_t::y & ~bfn_t::z>(s0, s1, s2);
+enum class bfn_t : uint8_t { x = 0xAA, y = 0xCC, z = 0xF0 };
+
+static constexpr bfn_t operator~(bfn_t x) {
+  uint8_t val = static_cast<uint8_t>(x);
+  uint8_t res = ~val;
+  return static_cast<bfn_t>(res);
+}
+
+static constexpr bfn_t operator|(bfn_t x, bfn_t y) {
+  uint8_t arg0 = static_cast<uint8_t>(x);
+  uint8_t arg1 = static_cast<uint8_t>(y);
+  uint8_t res = arg0 | arg1;
+  return static_cast<bfn_t>(res);
+}
+
+static constexpr bfn_t operator&(bfn_t x, bfn_t y) {
+  uint8_t arg0 = static_cast<uint8_t>(x);
+  uint8_t arg1 = static_cast<uint8_t>(y);
+  uint8_t res = arg0 & arg1;
+  return static_cast<bfn_t>(res);
+}
+
+static constexpr bfn_t operator^(bfn_t x, bfn_t y) {
+  uint8_t arg0 = static_cast<uint8_t>(x);
+  uint8_t arg1 = static_cast<uint8_t>(y);
+  uint8_t res = arg0 ^ arg1;
+  return static_cast<bfn_t>(res);
+}
+
+/// Performs binary function computation with three vector operands.
+/// @tparam FuncControl boolean function control expressed with bfn_t
+/// enum values.
+/// @tparam T type of the input vector element.
+/// @tparam N size of the input vector.
+/// @param s0 First boolean function argument.
+/// @param s1 Second boolean function argument.
+/// @param s2 Third boolean function argument.
+template <bfn_t FuncControl, typename T, int N>
+__ESIMD_API std::enable_if_t<std::is_integral_v<T>, __ESIMD_NS::simd<T, N>>
+bfn(__ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
+    __ESIMD_NS::simd<T, N> src2) {
+  if constexpr ((sizeof(T) == 8) || ((sizeof(T) == 1) && (N % 4 == 0)) ||
+                ((sizeof(T) == 2) && (N % 2 == 0))) {
+    // Bitcast Nx8-byte vectors to 2xN vectors of 4-byte integers.
+    // Bitcast Nx1-byte vectors to N/4 vectors of 4-byte integers.
+    // Bitcast Nx2-byte vectors to N/2 vectors of 4-byte integers.
+    auto Result = __ESIMD_ENS::bfn<FuncControl>(
+        src0.template bit_cast_view<int32_t>().read(),
+        src1.template bit_cast_view<int32_t>().read(),
+        src2.template bit_cast_view<int32_t>().read());
+    return Result.template bit_cast_view<T>();
+  } else if constexpr (sizeof(T) == 2 || sizeof(T) == 4) {
+    constexpr uint8_t FC = static_cast<uint8_t>(FuncControl);
+    return __esimd_bfn<FC, T, N>(src0.data(), src1.data(), src2.data());
+  } else if constexpr (N % 2 == 0) {
+    // Bitcast Nx1-byte vectors (N is even) to N/2 vectors of 2-byte integers.
+    auto Result = __ESIMD_ENS::bfn<FuncControl>(
+        src0.template bit_cast_view<int16_t>().read(),
+        src1.template bit_cast_view<int16_t>().read(),
+        src2.template bit_cast_view<int16_t>().read());
+    return Result.template bit_cast_view<T>();
+  } else {
+    // Odd number of 1-byte elements.
+    __ESIMD_NS::simd<T, N + 1> Src0, Src1, Src2;
+    Src0.template select<N, 1>() = src0;
+    Src1.template select<N, 1>() = src1;
+    Src2.template select<N, 1>() = src2;
+    auto Result = __ESIMD_ENS::bfn<FuncControl>(Src0, Src1, Src2);
+    return Result.template select<N, 1>();
+  }
+}
+
+/// Performs binary function computation with three scalar operands.
+/// @tparam FuncControl boolean function control expressed with bfn_t enum
+/// values.
+/// @tparam T type of the input vector element.
+/// @param s0 First boolean function argument.
+/// @param s1 Second boolean function argument.
+/// @param s2 Third boolean function argument.
+template <bfn_t FuncControl, typename T>
+ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t<
+    __ESIMD_DNS::is_esimd_scalar<T>::value && std::is_integral_v<T>, T>
+bfn(T src0, T src1, T src2) {
+  __ESIMD_NS::simd<T, 1> Src0 = src0;
+  __ESIMD_NS::simd<T, 1> Src1 = src1;
+  __ESIMD_NS::simd<T, 1> Src2 = src2;
+  __ESIMD_NS::simd<T, 1> Result =
+      esimd::bfn<FuncControl, T, 1>(Src0, Src1, Src2);
+  return Result[0];
+}
+
+/// @} sycl_esimd_logical
+
 } // namespace ext::intel::experimental::esimd
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index c604bf249ca1b..201a92ab53ec4 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -2665,7 +2665,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset>
-__ESIMD_API __ESIMD_NS::simd<T, N>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
+    __ESIMD_NS::simd<T, N>>
 lsc_atomic_update(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                   __ESIMD_NS::simd_mask<N> pred) {
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
@@ -2697,7 +2699,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset,
           typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
-__ESIMD_API __ESIMD_NS::simd<T, N>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
+    __ESIMD_NS::simd<T, N>>
 lsc_atomic_update(T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
                   __ESIMD_NS::simd_mask<N> pred = 1) {
   using Ty = typename __ESIMD_NS::simd_view<Toffset, RegionTy>::element_type;
@@ -2709,8 +2713,10 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset>
-__ESIMD_API
-    std::enable_if_t<std::is_integral_v<Toffset>, __ESIMD_NS::simd<T, N>>
+__ESIMD_API std::enable_if_t<
+    std::is_integral_v<Toffset> &&
+        __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 0,
+    __ESIMD_NS::simd<T, N>>
     lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask<N> pred = 1) {
   return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
       p, __ESIMD_NS::simd<Toffset, N>(offset), pred);
@@ -2735,7 +2741,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset>
-__ESIMD_API __ESIMD_NS::simd<T, N>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1,
+    __ESIMD_NS::simd<T, N>>
 lsc_atomic_update(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                   __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd_mask<N> pred) {
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
@@ -2768,7 +2776,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset,
           typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
-__ESIMD_API __ESIMD_NS::simd<T, N>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1,
+    __ESIMD_NS::simd<T, N>>
 lsc_atomic_update(T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
                   __ESIMD_NS::simd<T, N> src0,
                   __ESIMD_NS::simd_mask<N> pred = 1) {
@@ -2781,11 +2791,13 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset>
-__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
-                                 ((Op != __ESIMD_NS::atomic_op::store &&
-                                   Op != __ESIMD_NS::atomic_op::xchg) ||
-                                  N == 1),
-                             __ESIMD_NS::simd<T, N>>
+__ESIMD_API std::enable_if_t<
+    std::is_integral_v<Toffset> &&
+        __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 1 &&
+        ((Op != __ESIMD_NS::atomic_op::store &&
+          Op != __ESIMD_NS::atomic_op::xchg) ||
+         N == 1),
+    __ESIMD_NS::simd<T, N>>
 lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd<T, N> src0,
                   __ESIMD_NS::simd_mask<N> pred = 1) {
   return lsc_atomic_update<Op, T, N, DS, L1H, L3H>(
@@ -2811,7 +2823,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset>
-__ESIMD_API __ESIMD_NS::simd<T, N>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
+    __ESIMD_NS::simd<T, N>>
 lsc_atomic_update(T *p, __ESIMD_NS::simd<Toffset, N> offsets,
                   __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
                   __ESIMD_NS::simd_mask<N> pred) {
@@ -2846,7 +2860,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset,
           typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
-__ESIMD_API __ESIMD_NS::simd<T, N>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
+    __ESIMD_NS::simd<T, N>>
 lsc_atomic_update(T *p, __ESIMD_NS::simd_view<Toffset, RegionTy> offsets,
                   __ESIMD_NS::simd<T, N> src0, __ESIMD_NS::simd<T, N> src1,
                   __ESIMD_NS::simd_mask<N> pred = 1) {
@@ -2859,8 +2875,10 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N,
           lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
           typename Toffset>
-__ESIMD_API
-    std::enable_if_t<std::is_integral_v<Toffset>, __ESIMD_NS::simd<T, N>>
+__ESIMD_API std::enable_if_t<
+    std::is_integral_v<Toffset> &&
+        __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op<Op>()>() == 2,
+    __ESIMD_NS::simd<T, N>>
     lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd<T, N> src0,
                       __ESIMD_NS::simd<T, N> src1,
                       __ESIMD_NS::simd_mask<N> pred = 1) {
@@ -3082,58 +3100,64 @@ namespace esimd {
 /// <tt>native::lsc::atomic_op</tt> instead of <tt>atomic_op</tt> as atomic
 /// operation template argument.
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
-__ESIMD_API simd<T, N> atomic_update(T *p, simd<Toffset, N> offset,
-                                     simd_mask<N> mask) {
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> offset, simd_mask<N> mask) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offset, mask);
 }
 
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
           typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
-__ESIMD_API simd<T, N> atomic_update(T *p, simd_view<Toffset, RegionTy> offsets,
-                                     simd_mask<N> mask = 1) {
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
+atomic_update(T *p, simd_view<Toffset, RegionTy> offsets,
+              simd_mask<N> mask = 1) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offsets, mask);
 }
 
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
-__ESIMD_API
-    std::enable_if_t<std::is_integral_v<Toffset>, __ESIMD_NS::simd<T, N>>
-    atomic_update(T *p, Toffset offset, simd_mask<N> mask = 1) {
+__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
+                                 __ESIMD_DNS::get_num_args<Op>() == 0,
+                             simd<T, N>>
+atomic_update(T *p, Toffset offset, simd_mask<N> mask = 1) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offset, mask);
 }
 
 /// LSC version of the single-argument atomic update.
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
-__ESIMD_API simd<T, N> atomic_update(T *p, simd<Toffset, N> offset,
-                                     simd<T, N> src0, simd_mask<N> mask) {
+__ESIMD_API __ESIMD_API
+    std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
+    atomic_update(T *p, simd<Toffset, N> offset, simd<T, N> src0,
+                  simd_mask<N> mask) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offset, src0, mask);
 }
 
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
           typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
-__ESIMD_API simd<T, N> atomic_update(T *p, simd_view<Toffset, RegionTy> offsets,
-                                     simd<T, N> src0, simd_mask<N> mask = 1) {
+__ESIMD_API __ESIMD_API
+    std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
+    atomic_update(T *p, simd_view<Toffset, RegionTy> offsets, simd<T, N> src0,
+                  simd_mask<N> mask = 1) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offsets, src0, mask);
 }
 
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
-__ESIMD_API
-    std::enable_if_t<std::is_integral_v<Toffset>, __ESIMD_NS::simd<T, N>>
-    atomic_update(T *p, Toffset offset, simd<T, N> src0,
-                  simd_mask<N> mask = 1) {
+__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
+                                 __ESIMD_DNS::get_num_args<Op>() == 1,
+                             simd<T, N>>
+atomic_update(T *p, Toffset offset, simd<T, N> src0, simd_mask<N> mask = 1) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offset, src0, mask);
 }
 
 /// LSC version of the two-argument atomic update.
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
-__ESIMD_API simd<T, N> atomic_update(T *p, simd<Toffset, N> offset,
-                                     simd<T, N> src0, simd<T, N> src1,
-                                     simd_mask<N> mask) {
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> offset, simd<T, N> src0, simd<T, N> src1,
+              simd_mask<N> mask) {
   // 2-argument lsc_atomic_update arguments order matches the standard one -
   // expected value first, then new value. But atomic_update uses reverse
   // order, hence the src1/src0 swap.
@@ -3143,18 +3167,19 @@ __ESIMD_API simd<T, N> atomic_update(T *p, simd<Toffset, N> offset,
 
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset,
           typename RegionTy = __ESIMD_NS::region1d_t<Toffset, N, 1>>
-__ESIMD_API simd<T, N> atomic_update(T *p, simd_view<Toffset, RegionTy> offsets,
-                                     simd<T, N> src0, simd<T, N> src1,
-                                     simd_mask<N> mask = 1) {
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
+atomic_update(T *p, simd_view<Toffset, RegionTy> offsets, simd<T, N> src0,
+              simd<T, N> src1, simd_mask<N> mask = 1) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offsets, src1, src0, mask);
 }
 
 template <native::lsc::atomic_op Op, typename T, int N, typename Toffset>
-__ESIMD_API
-    std::enable_if_t<std::is_integral_v<Toffset>, __ESIMD_NS::simd<T, N>>
-    atomic_update(T *p, Toffset offset, simd<T, N> src0, simd<T, N> src1,
-                  simd_mask<N> mask = 1) {
+__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> &&
+                                 __ESIMD_DNS::get_num_args<Op>() == 2,
+                             __ESIMD_NS::simd<T, N>>
+atomic_update(T *p, Toffset offset, simd<T, N> src0, simd<T, N> src1,
+              simd_mask<N> mask = 1) {
   return __ESIMD_ENS::lsc_atomic_update<detail::to_atomic_op<Op>(), T, N>(
       p, offset, src1, src0, mask);
 }
diff --git a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp
index 018eb56612ea3..0853ff55274c3 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp
@@ -194,6 +194,19 @@ template <typename T> struct unwrap_uniform<uniform<T>> {
   static T impl(uniform<T> val) { return val; }
 };
 
+// Verify the callee return type matches the subgroup size as is required by the
+// spec. For example: simd<int, 8> foo(simd<int,16>); The return type vector
+// length (8) does not match the subgroup size (16).
+template <auto SgSize, typename SimdRet>
+constexpr void verify_return_type_matches_sg_size() {
+  if constexpr (is_simd_or_mask_type<SimdRet>::value) {
+    constexpr auto RetVecLength = SimdRet::size();
+    static_assert(RetVecLength == SgSize,
+                  "invoke_simd callee return type vector length must match "
+                  "kernel subgroup size");
+  }
+}
+
 // Deduces subgroup size of the caller based on given SIMD callable and
 // corresponding SPMD arguments it is being invoke with via invoke_simd.
 // Basically, for each supported subgroup size, this meta-function finds out if
@@ -349,6 +362,8 @@ __attribute__((always_inline)) auto invoke_simd(sycl::sub_group sg,
   // is fine in this case.
   constexpr int N = detail::get_sg_size<Callable, T...>();
   using RetSpmd = detail::SpmdRetType<N, Callable, T...>;
+  detail::verify_return_type_matches_sg_size<
+      N, detail::SimdRetType<N, Callable, T...>>();
   constexpr bool is_function = detail::is_function_ptr_or_ref_v<Callable>;
 
   if constexpr (is_function) {
diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp
index e9f21be8ce16f..b8be0c6573620 100644
--- a/sycl/include/sycl/ext/oneapi/functional.hpp
+++ b/sycl/include/sycl/ext/oneapi/functional.hpp
@@ -8,7 +8,9 @@
 
 #pragma once
 #include <sycl/functional.hpp>
+#include <sycl/half_type.hpp>
 
+#include <complex>
 #include <functional>
 
 namespace sycl {
@@ -31,6 +33,7 @@ namespace detail {
 struct GroupOpISigned {};
 struct GroupOpIUnsigned {};
 struct GroupOpFP {};
+struct GroupOpC {};
 
 template <typename T, typename = void> struct GroupOpTag;
 
@@ -49,6 +52,14 @@ struct GroupOpTag<T, detail::enable_if_t<detail::is_sgenfloat<T>::value>> {
   using type = GroupOpFP;
 };
 
+template <typename T>
+struct GroupOpTag<
+    T, detail::enable_if_t<std::is_same<T, std::complex<half>>::value ||
+                           std::is_same<T, std::complex<float>>::value ||
+                           std::is_same<T, std::complex<double>>::value>> {
+  using type = GroupOpC;
+};
+
 #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation)        \
   template <typename T, __spv::GroupOperation O, __spv::Scope::Flag S>         \
   static T calc(GroupTag, T x, BinaryOperation) {                              \
@@ -83,6 +94,7 @@ __SYCL_CALC_OVERLOAD(GroupOpFP, FAdd, sycl::plus<T>)
 __SYCL_CALC_OVERLOAD(GroupOpISigned, IMulKHR, sycl::multiplies<T>)
 __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, IMulKHR, sycl::multiplies<T>)
 __SYCL_CALC_OVERLOAD(GroupOpFP, FMulKHR, sycl::multiplies<T>)
+__SYCL_CALC_OVERLOAD(GroupOpC, CMulINTEL, sycl::multiplies<T>)
 
 __SYCL_CALC_OVERLOAD(GroupOpISigned, BitwiseOrKHR, sycl::bit_or<T>)
 __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, BitwiseOrKHR, sycl::bit_or<T>)
diff --git a/sycl/include/sycl/ext/oneapi/weak_object_base.hpp b/sycl/include/sycl/ext/oneapi/weak_object_base.hpp
index 52877d9d64c9f..7dc10e7e86e1a 100644
--- a/sycl/include/sycl/ext/oneapi/weak_object_base.hpp
+++ b/sycl/include/sycl/ext/oneapi/weak_object_base.hpp
@@ -33,6 +33,9 @@ template <typename SYCLObjT> class weak_object_base {
   weak_object_base(const weak_object_base &Other) noexcept = default;
   weak_object_base(weak_object_base &&Other) noexcept = default;
 
+  weak_object_base &operator=(const weak_object_base &Other) noexcept = default;
+  weak_object_base &operator=(weak_object_base &&Other) noexcept = default;
+
   void reset() noexcept { MObjWeakPtr.reset(); }
   void swap(weak_object_base &Other) noexcept {
     MObjWeakPtr.swap(Other.MObjWeakPtr);
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index b2b801986cbc6..1fa39d5ba3b5c 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -106,12 +106,19 @@ using is_plus = std::integral_constant<
     bool, std::is_same<BinaryOperation, sycl::plus<T>>::value ||
               std::is_same<BinaryOperation, sycl::plus<void>>::value>;
 
+// ---- is_multiplies
+template <typename T, typename BinaryOperation>
+using is_multiplies = std::integral_constant<
+    bool, std::is_same<BinaryOperation, sycl::multiplies<T>>::value ||
+              std::is_same<BinaryOperation, sycl::multiplies<void>>::value>;
+
 // ---- is_complex
 // NOTE: std::complex<long double> not yet supported by group algorithms.
 template <typename T>
 struct is_complex
     : std::integral_constant<bool,
-                             std::is_same<T, std::complex<float>>::value ||
+                             std::is_same<T, std::complex<half>>::value ||
+                                 std::is_same<T, std::complex<float>>::value ||
                                  std::is_same<T, std::complex<double>>::value> {
 };
 
@@ -120,12 +127,19 @@ template <typename T>
 using is_arithmetic_or_complex =
     std::integral_constant<bool, sycl::detail::is_complex<T>::value ||
                                      sycl::detail::is_arithmetic<T>::value>;
-// ---- is_plus_if_complex
+
+template <typename T>
+struct is_vector_arithmetic_or_complex
+    : bool_constant<is_vec<T>::value &&
+                    (is_arithmetic<T>::value ||
+                     is_complex<vector_element_t<T>>::value)> {};
+
+// ---- is_plus_or_multiplies_if_complex
 template <typename T, typename BinaryOperation>
-using is_plus_if_complex =
-    std::integral_constant<bool, (is_complex<T>::value
-                                      ? is_plus<T, BinaryOperation>::value
-                                      : std::true_type::value)>;
+using is_plus_or_multiplies_if_complex = std::integral_constant<
+    bool, (is_complex<T>::value ? (is_plus<T, BinaryOperation>::value ||
+                                   is_multiplies<T, BinaryOperation>::value)
+                                : std::true_type::value)>;
 
 // ---- identity_for_ga_op
 //   the group algorithms support std::complex, limited to sycl::plus operation
@@ -139,6 +153,13 @@ identity_for_ga_op() {
   return {0, 0};
 }
 
+template <typename T, class BinaryOperation>
+constexpr detail::enable_if_t<
+    (is_complex<T>::value && is_multiplies<T, BinaryOperation>::value), T>
+identity_for_ga_op() {
+  return {1, 0};
+}
+
 template <typename T, class BinaryOperation>
 constexpr detail::enable_if_t<!is_complex<T>::value, T> identity_for_ga_op() {
   return sycl::known_identity_v<BinaryOperation, T>;
@@ -171,7 +192,9 @@ Function for_each(Group g, Ptr first, Ptr last, Function f) {
 
 template <typename Group, typename T, class BinaryOperation>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_scalar_arithmetic<T>::value &&
+                     (detail::is_scalar_arithmetic<T>::value ||
+                      (detail::is_complex<T>::value &&
+                       detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
 reduce_over_group(Group, T x, BinaryOperation binary_op) {
@@ -217,7 +240,7 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) {
 template <typename Group, typename T, int N, class BinaryOperation>
 detail::enable_if_t<
     (is_group_v<std::decay_t<Group>> &&
-     detail::is_vector_arithmetic<sycl::vec<T, N>>::value &&
+     detail::is_vector_arithmetic_or_complex<sycl::vec<T, N>>::value &&
      detail::is_native_op<sycl::vec<T, N>, BinaryOperation>::value),
     sycl::vec<T, N>>
 reduce_over_group(Group g, sycl::vec<T, N> x, BinaryOperation binary_op) {
@@ -244,8 +267,8 @@ detail::enable_if_t<
      (detail::is_scalar_arithmetic<T>::value || detail::is_complex<T>::value) &&
      detail::is_native_op<V, BinaryOperation>::value &&
      detail::is_native_op<T, BinaryOperation>::value &&
-     detail::is_plus_if_complex<T, BinaryOperation>::value &&
-     detail::is_plus_if_complex<V, BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<T, BinaryOperation>::value &&
+     detail::is_plus_or_multiplies_if_complex<V, BinaryOperation>::value),
     T>
 reduce_over_group(Group g, V x, T init, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
@@ -265,8 +288,8 @@ reduce_over_group(Group g, V x, T init, BinaryOperation binary_op) {
 
 template <typename Group, typename V, typename T, class BinaryOperation>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_vector_arithmetic<V>::value &&
-                     detail::is_vector_arithmetic<T>::value &&
+                     detail::is_vector_arithmetic_or_complex<V>::value &&
+                     detail::is_vector_arithmetic_or_complex<T>::value &&
                      detail::is_native_op<V, BinaryOperation>::value &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
@@ -297,8 +320,8 @@ detail::enable_if_t<
     (is_group_v<std::decay_t<Group>> && detail::is_pointer<Ptr>::value &&
      detail::is_arithmetic_or_complex<
          typename detail::remove_pointer<Ptr>::type>::value &&
-     detail::is_plus_if_complex<typename detail::remove_pointer<Ptr>::type,
-                                BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<
+         typename detail::remove_pointer<Ptr>::type, BinaryOperation>::value),
     typename detail::remove_pointer<Ptr>::type>
 joint_reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) {
 #ifdef __SYCL_DEVICE_ONLY__
@@ -323,9 +346,9 @@ detail::enable_if_t<
      detail::is_arithmetic_or_complex<T>::value &&
      detail::is_native_op<typename detail::remove_pointer<Ptr>::type,
                           BinaryOperation>::value &&
-     detail::is_plus_if_complex<typename detail::remove_pointer<Ptr>::type,
-                                BinaryOperation>::value &&
-     detail::is_plus_if_complex<T, BinaryOperation>::value &&
+     detail::is_plus_or_multiplies_if_complex<
+         typename detail::remove_pointer<Ptr>::type, BinaryOperation>::value &&
+     detail::is_plus_or_multiplies_if_complex<T, BinaryOperation>::value &&
      detail::is_native_op<T, BinaryOperation>::value),
     T>
 joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) {
@@ -600,7 +623,9 @@ group_broadcast(Group g, T x) {
 //   vector
 template <typename Group, typename T, class BinaryOperation>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_scalar_arithmetic<T>::value &&
+                     (detail::is_scalar_arithmetic<T>::value ||
+                      (detail::is_complex<T>::value &&
+                       detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
 exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) {
@@ -644,7 +669,7 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
 
 template <typename Group, typename T, class BinaryOperation>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_vector_arithmetic<T>::value &&
+                     detail::is_vector_arithmetic_or_complex<T>::value &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
 exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
@@ -666,8 +691,8 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
 // once for vector_arithmetic, once for (scalar_arithmetic || complex)
 template <typename Group, typename V, typename T, class BinaryOperation>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_vector_arithmetic<V>::value &&
-                     detail::is_vector_arithmetic<T>::value &&
+                     detail::is_vector_arithmetic_or_complex<V>::value &&
+                     detail::is_vector_arithmetic_or_complex<T>::value &&
                      detail::is_native_op<V, BinaryOperation>::value &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
@@ -693,8 +718,8 @@ detail::enable_if_t<
      (detail::is_scalar_arithmetic<T>::value || detail::is_complex<T>::value) &&
      detail::is_native_op<V, BinaryOperation>::value &&
      detail::is_native_op<T, BinaryOperation>::value &&
-     detail::is_plus_if_complex<V, BinaryOperation>::value &&
-     detail::is_plus_if_complex<T, BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<V, BinaryOperation>::value &&
+     detail::is_plus_or_multiplies_if_complex<T, BinaryOperation>::value),
     T>
 exclusive_scan_over_group(Group g, V x, T init, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
@@ -732,9 +757,10 @@ detail::enable_if_t<
      detail::is_native_op<typename detail::remove_pointer<InPtr>::type,
                           BinaryOperation>::value &&
      detail::is_native_op<T, BinaryOperation>::value &&
-     detail::is_plus_if_complex<typename detail::remove_pointer<InPtr>::type,
-                                BinaryOperation>::value &&
-     detail::is_plus_if_complex<T, BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<
+         typename detail::remove_pointer<InPtr>::type,
+         BinaryOperation>::value &&
+     detail::is_plus_or_multiplies_if_complex<T, BinaryOperation>::value),
     OutPtr>
 joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init,
                      BinaryOperation binary_op) {
@@ -787,8 +813,8 @@ detail::enable_if_t<
          typename detail::remove_pointer<InPtr>::type>::value &&
      detail::is_native_op<typename detail::remove_pointer<InPtr>::type,
                           BinaryOperation>::value &&
-     detail::is_plus_if_complex<typename detail::remove_pointer<InPtr>::type,
-                                BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<
+         typename detail::remove_pointer<InPtr>::type, BinaryOperation>::value),
     OutPtr>
 joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
                      BinaryOperation binary_op) {
@@ -811,7 +837,7 @@ joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
 //   complex
 template <typename Group, typename T, class BinaryOperation>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_vector_arithmetic<T>::value &&
+                     detail::is_vector_arithmetic_or_complex<T>::value &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
 inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
@@ -831,7 +857,9 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
 
 template <typename Group, typename T, class BinaryOperation>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_scalar_arithmetic<T>::value &&
+                     (detail::is_scalar_arithmetic<T>::value ||
+                      (detail::is_complex<T>::value &&
+                       detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
 inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) {
@@ -881,8 +909,8 @@ detail::enable_if_t<
      (detail::is_scalar_arithmetic<T>::value || detail::is_complex<T>::value) &&
      detail::is_native_op<V, BinaryOperation>::value &&
      detail::is_native_op<T, BinaryOperation>::value &&
-     detail::is_plus_if_complex<T, BinaryOperation>::value &&
-     detail::is_plus_if_complex<V, BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<T, BinaryOperation>::value &&
+     detail::is_plus_or_multiplies_if_complex<V, BinaryOperation>::value),
     T>
 inclusive_scan_over_group(Group g, V x, BinaryOperation binary_op, T init) {
   // FIXME: Do not special-case for half precision
@@ -904,8 +932,8 @@ inclusive_scan_over_group(Group g, V x, BinaryOperation binary_op, T init) {
 
 template <typename Group, typename V, class BinaryOperation, typename T>
 detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_vector_arithmetic<V>::value &&
-                     detail::is_vector_arithmetic<T>::value &&
+                     detail::is_vector_arithmetic_or_complex<V>::value &&
+                     detail::is_vector_arithmetic_or_complex<T>::value &&
                      detail::is_native_op<V, BinaryOperation>::value &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
@@ -935,9 +963,10 @@ detail::enable_if_t<
      detail::is_native_op<typename detail::remove_pointer<InPtr>::type,
                           BinaryOperation>::value &&
      detail::is_native_op<T, BinaryOperation>::value &&
-     detail::is_plus_if_complex<typename detail::remove_pointer<InPtr>::type,
-                                BinaryOperation>::value &&
-     detail::is_plus_if_complex<T, BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<
+         typename detail::remove_pointer<InPtr>::type,
+         BinaryOperation>::value &&
+     detail::is_plus_or_multiplies_if_complex<T, BinaryOperation>::value),
     OutPtr>
 joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
                      BinaryOperation binary_op, T init) {
@@ -989,8 +1018,8 @@ detail::enable_if_t<
          typename detail::remove_pointer<InPtr>::type>::value &&
      detail::is_native_op<typename detail::remove_pointer<InPtr>::type,
                           BinaryOperation>::value &&
-     detail::is_plus_if_complex<typename detail::remove_pointer<InPtr>::type,
-                                BinaryOperation>::value),
+     detail::is_plus_or_multiplies_if_complex<
+         typename detail::remove_pointer<InPtr>::type, BinaryOperation>::value),
     OutPtr>
 joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
                      BinaryOperation binary_op) {
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 0cf58142e43a5..b98567b3cf94b 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -2507,8 +2507,8 @@ class __SYCL_EXPORT handler {
 
     // Do the following:
     // 1. If both are host, use host_task to copy.
-    // 2. If either pointer is host or of the backend supports native memcpy2d,
-    //    use special command.
+    // 2. If either pointer is host or the backend supports native memcpy2d, use
+    //    special command.
     // 3. Otherwise, launch a kernel for copying.
     if (SrcIsHost && DestIsHost) {
       commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
diff --git a/sycl/include/sycl/kernel_bundle.hpp b/sycl/include/sycl/kernel_bundle.hpp
index 6eed7b270edca..12651407a537f 100644
--- a/sycl/include/sycl/kernel_bundle.hpp
+++ b/sycl/include/sycl/kernel_bundle.hpp
@@ -308,17 +308,19 @@ class kernel_bundle : public detail::kernel_bundle_plain,
   template <auto &SpecName>
   typename std::remove_reference_t<decltype(SpecName)>::value_type
   get_specialization_constant() const {
-    const char *SpecSymName = detail::get_spec_constant_symbolic_ID<SpecName>();
-    if (!is_specialization_constant_set(SpecSymName))
-      return SpecName.getDefaultValue();
-
     using SCType =
         typename std::remove_reference_t<decltype(SpecName)>::value_type;
 
+    const char *SpecSymName = detail::get_spec_constant_symbolic_ID<SpecName>();
+    SCType Res{SpecName.getDefaultValue()};
+    if (!is_specialization_constant_set(SpecSymName))
+      return Res;
+
     std::array<char, sizeof(SCType)> RetValue;
     get_specialization_constant_impl(SpecSymName, RetValue.data());
+    std::memcpy(&Res, RetValue.data(), sizeof(SCType));
 
-    return *reinterpret_cast<SCType *>(RetValue.data());
+    return Res;
   }
 
   /// \returns an iterator to the first device image kernel_bundle contains
diff --git a/sycl/include/sycl/multi_ptr.hpp b/sycl/include/sycl/multi_ptr.hpp
index ecfacec93b7c9..fecbfbfed620a 100644
--- a/sycl/include/sycl/multi_ptr.hpp
+++ b/sycl/include/sycl/multi_ptr.hpp
@@ -149,7 +149,7 @@ class multi_ptr {
                 (Space == access::address_space::generic_space ||
                  Space == access::address_space::local_space)>>
   multi_ptr(local_accessor<ElementType, Dimensions> Accessor)
-      : multi_ptr(Accessor.get_pointer().get()) {}
+      : m_Pointer(detail::cast_AS<decorated_type *>(Accessor.get_pointer())) {}
 
   // The following constructors are necessary to create multi_ptr<const
   // ElementType, Space, DecorateAddress> from accessor<ElementType, ...>.
@@ -210,7 +210,7 @@ class multi_ptr {
   multi_ptr(local_accessor<typename detail::remove_const_t<RelayElementType>,
                            Dimensions>
                 Accessor)
-      : multi_ptr(Accessor.get_pointer().get()) {}
+      : m_Pointer(detail::cast_AS<decorated_type *>(Accessor.get_pointer())) {}
 
   // Assignment and access operators
   multi_ptr &operator=(const multi_ptr &) = default;
@@ -465,7 +465,7 @@ class multi_ptr<const void, Space, DecorateAddress> {
       typename = typename detail::enable_if_t<
           RelaySpace == Space && Space == access::address_space::local_space>>
   multi_ptr(local_accessor<ElementType, Dimensions> Accessor)
-      : multi_ptr(Accessor.get_pointer().get()) {}
+      : m_Pointer(detail::cast_AS<decorated_type *>(Accessor.get_pointer())) {}
 
   // Assignment operators
   multi_ptr &operator=(const multi_ptr &) = default;
@@ -591,7 +591,7 @@ class multi_ptr<void, Space, DecorateAddress> {
       typename = typename detail::enable_if_t<
           RelaySpace == Space && Space == access::address_space::local_space>>
   multi_ptr(local_accessor<ElementType, Dimensions> Accessor)
-      : multi_ptr(Accessor.get_pointer().get()) {}
+      : m_Pointer(detail::cast_AS<decorated_type *>(Accessor.get_pointer())) {}
 
   // Assignment operators
   multi_ptr &operator=(const multi_ptr &) = default;
@@ -848,7 +848,7 @@ class multi_ptr<ElementType, Space, access::decorated::legacy> {
           std::is_const<ET>::value && std::is_same<ET, ElementType>::value>>
   multi_ptr(
       local_accessor<typename detail::remove_const_t<ET>, dimensions> Accessor)
-      : multi_ptr(Accessor.get_pointer()) {}
+      : m_Pointer(detail::cast_AS<pointer_t>(Accessor.get_pointer())) {}
 
   // Only if Space == constant_space and element type is const
   template <
@@ -1089,7 +1089,7 @@ class multi_ptr<void, Space, access::decorated::legacy> {
           _Space == Space && (Space == access::address_space::generic_space ||
                               Space == access::address_space::local_space)>>
   multi_ptr(local_accessor<ElementType, dimensions> Accessor)
-      : multi_ptr(Accessor.get_pointer()) {}
+      : m_Pointer(detail::cast_AS<pointer_t>(Accessor.get_pointer())) {}
 
   // Only if Space == constant_space
   template <
@@ -1232,7 +1232,7 @@ class multi_ptr<const void, Space, access::decorated::legacy> {
           _Space == Space && (Space == access::address_space::generic_space ||
                               Space == access::address_space::local_space)>>
   multi_ptr(local_accessor<ElementType, dimensions> Accessor)
-      : multi_ptr(Accessor.get_pointer()) {}
+      : m_Pointer(detail::cast_AS<pointer_t>(Accessor.get_pointer())) {}
 
   // Only if Space == constant_space
   template <
diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp
index 94fa6fadae41a..47dc36c3e0bc6 100644
--- a/sycl/include/sycl/reduction.hpp
+++ b/sycl/include/sycl/reduction.hpp
@@ -422,6 +422,13 @@ class ReductionIdentityContainer<
   static constexpr bool has_identity = false;
 };
 
+// Token class to help with the in-place construction of reducers.
+template <class BinaryOperation, typename IdentityContainerT>
+struct ReducerToken {
+  const IdentityContainerT &IdentityContainer;
+  const BinaryOperation BOp;
+};
+
 } // namespace detail
 
 /// Specialization of the generic class 'reducer'. It is used for reductions
@@ -458,6 +465,14 @@ class reducer<
   reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
       : MValue(GetInitialValue(IdentityContainer)),
         MIdentity(IdentityContainer), MBinaryOp(BOp) {}
+  reducer(
+      const detail::ReducerToken<BinaryOperation, IdentityContainerT> &Token)
+      : reducer(Token.IdentityContainer, Token.BOp) {}
+
+  reducer(const reducer &) = delete;
+  reducer(reducer &&) = delete;
+  reducer &operator=(const reducer &) = delete;
+  reducer &operator=(reducer &&) = delete;
 
   reducer &combine(const T &Partial) {
     if constexpr (has_identity)
@@ -515,6 +530,14 @@ class reducer<
   reducer() : MValue(getIdentity()) {}
   reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
       : MValue(getIdentity()) {}
+  reducer(
+      const detail::ReducerToken<BinaryOperation, IdentityContainerT> &Token)
+      : reducer(Token.IdentityContainer, Token.BOp) {}
+
+  reducer(const reducer &) = delete;
+  reducer(reducer &&) = delete;
+  reducer &operator=(const reducer &) = delete;
+  reducer &operator=(reducer &&) = delete;
 
   reducer &combine(const T &Partial) {
     BinaryOperation BOp;
@@ -553,6 +576,14 @@ class reducer<T, BinaryOperation, Dims, Extent, IdentityContainerT, View,
 public:
   reducer(internal_value_type &Ref, BinaryOperation BOp)
       : MElement(Ref), MBinaryOp(BOp) {}
+  reducer(
+      const detail::ReducerToken<BinaryOperation, IdentityContainerT> &Token)
+      : reducer(Token.IdentityContainer, Token.BOp) {}
+
+  reducer(const reducer &) = delete;
+  reducer(reducer &&) = delete;
+  reducer &operator=(const reducer &) = delete;
+  reducer &operator=(reducer &&) = delete;
 
   reducer &combine(const T &Partial) {
     if constexpr (has_identity)
@@ -599,6 +630,14 @@ class reducer<
   reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp)
       : MValue(GetInitialValue(IdentityContainer)),
         MIdentity(IdentityContainer), MBinaryOp(BOp) {}
+  reducer(
+      const detail::ReducerToken<BinaryOperation, IdentityContainerT> &Token)
+      : reducer(Token.IdentityContainer, Token.BOp) {}
+
+  reducer(const reducer &) = delete;
+  reducer(reducer &&) = delete;
+  reducer &operator=(const reducer &) = delete;
+  reducer &operator=(reducer &&) = delete;
 
   reducer<T, BinaryOperation, Dims - 1, Extent, IdentityContainerT, true>
   operator[](size_t Index) {
@@ -650,6 +689,14 @@ class reducer<
   reducer() : MValue(getIdentity()) {}
   reducer(const IdentityContainerT & /* Identity */, BinaryOperation)
       : MValue(getIdentity()) {}
+  reducer(
+      const detail::ReducerToken<BinaryOperation, IdentityContainerT> &Token)
+      : reducer(Token.IdentityContainer, Token.BOp) {}
+
+  reducer(const reducer &) = delete;
+  reducer(reducer &&) = delete;
+  reducer &operator=(const reducer &) = delete;
+  reducer &operator=(reducer &&) = delete;
 
   // SYCL 2020 revision 4 says this should be const, but this is a bug
   // see https://github.com/KhronosGroup/SYCL-Docs/pull/252
@@ -746,6 +793,8 @@ class reduction_impl_algo {
 
   using identity_container_type =
       ReductionIdentityContainer<T, BinaryOperation, ExplicitIdentity>;
+  using reducer_token_type =
+      detail::ReducerToken<BinaryOperation, identity_container_type>;
   using reducer_type =
       reducer<T, BinaryOperation, Dims, Extent, identity_container_type>;
   using result_type = T;
@@ -2062,8 +2111,11 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
       // Pass all reductions to user's lambda in the same order as supplied
       // Each reducer initializes its own storage
       auto ReduIndices = std::index_sequence_for<Reductions...>();
-      auto ReducersTuple = std::tuple{typename Reductions::reducer_type{
-          std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
+      auto ReducerTokensTuple =
+          std::tuple{typename Reductions::reducer_token_type{
+              std::get<Is>(IdentitiesTuple), std::get<Is>(BOPsTuple)}...};
+      auto ReducersTuple = std::tuple<typename Reductions::reducer_type...>{
+          std::get<Is>(ReducerTokensTuple)...};
       std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); },
                  ReducersTuple);
 
diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp
index 05cea2cf0f043..ae1b2655d7320 100644
--- a/sycl/include/sycl/types.hpp
+++ b/sycl/include/sycl/types.hpp
@@ -729,9 +729,6 @@ template <typename Type, int NumElements> class vec {
                 std::index_sequence<Is...>)
       : m_Data{Arr[Is]...} {}
 
-  constexpr vec(const std::array<vec_data_t<DataT>, NumElements> &Arr)
-      : vec{Arr, std::make_index_sequence<NumElements>()} {}
-
 public:
   using element_type = DataT;
   using rel_t = detail::rel_t<DataT>;
@@ -796,7 +793,8 @@ template <typename Type, int NumElements> class vec {
   template <typename Ty = DataT>
   explicit constexpr vec(const EnableIfHostHalf<Ty> &arg)
       : vec{detail::RepeatValue<NumElements>(
-            static_cast<vec_data_t<DataT>>(arg))} {}
+                static_cast<vec_data_t<DataT>>(arg)),
+            std::make_index_sequence<NumElements>()} {}
 
   template <typename Ty = DataT>
   typename detail::enable_if_t<
@@ -812,7 +810,8 @@ template <typename Type, int NumElements> class vec {
 #else
   explicit constexpr vec(const DataT &arg)
       : vec{detail::RepeatValue<NumElements>(
-            static_cast<vec_data_t<DataT>>(arg))} {}
+                static_cast<vec_data_t<DataT>>(arg)),
+            std::make_index_sequence<NumElements>()} {}
 
   template <typename Ty = DataT>
   typename detail::enable_if_t<
@@ -883,7 +882,8 @@ template <typename Type, int NumElements> class vec {
   template <typename... argTN, typename = EnableIfSuitableTypes<argTN...>,
             typename = EnableIfSuitableNumElements<argTN...>>
   constexpr vec(const argTN &...args)
-      : vec{VecArgArrayCreator<vec_data_t<DataT>, argTN...>::Create(args...)} {}
+      : vec{VecArgArrayCreator<vec_data_t<DataT>, argTN...>::Create(args...),
+            std::make_index_sequence<NumElements>()} {}
 
   // TODO: Remove, for debug purposes only.
   void dump() {
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 2d88978d87780..9e493502645a9 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1948,18 +1948,14 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
   }
 
   case PI_DEVICE_INFO_UUID: {
-    int driver_version = 0;
-    cuDriverGetVersion(&driver_version);
-    int major = driver_version / 1000;
-    int minor = driver_version % 1000 / 10;
     CUuuid uuid;
-    if ((major > 11) || (major == 11 && minor >= 4)) {
-      sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) ==
-                                  CUDA_SUCCESS);
-    } else {
-      sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) ==
-                                  CUDA_SUCCESS);
-    }
+#if (CUDA_VERSION >= 11040)
+    sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) ==
+                                CUDA_SUCCESS);
+#else
+    sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) ==
+                                CUDA_SUCCESS);
+#endif
     std::array<unsigned char, 16> name;
     std::copy(uuid.bytes, uuid.bytes + 16, name.begin());
     return getInfoArray(16, param_value_size, param_value, param_value_size_ret,
diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
index 0fc2a5a10f4f9..5bb0ce881e79f 100644
--- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
+++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
@@ -805,6 +805,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
     CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_IMAGE_SRGB)
     CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_ATOMIC_64)
     CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES)
+    CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES)
     CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS)
     CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D)
     CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D)
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index ddca2a872adfc..b3b2276fdfa5a 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -1863,9 +1863,33 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret,
                    capabilities);
   }
+
+  case PI_DEVICE_INFO_DEVICE_ID: {
+    int value = 0;
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&value, hipDeviceAttributePciDeviceId,
+                              device->get()) == hipSuccess);
+    sycl::detail::pi::assertion(value >= 0);
+    return getInfo(param_value_size, param_value, param_value_size_ret, value);
+  }
+
+  case PI_DEVICE_INFO_UUID: {
+#if ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) ||                     \
+     HIP_VERSION_MAJOR > 5)
+    hipUUID uuid = {};
+    // Supported since 5.2+
+    sycl::detail::pi::assertion(hipDeviceGetUuid(&uuid, device->get()) ==
+                                hipSuccess);
+    std::array<unsigned char, 16> name;
+    std::copy(uuid.bytes, uuid.bytes + 16, name.begin());
+    return getInfoArray(16, param_value_size, param_value, param_value_size_ret,
+                        name.data());
+#endif
+    return PI_ERROR_INVALID_VALUE;
+  }
+
   // TODO: Investigate if this information is available on HIP.
   case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
-  case PI_DEVICE_INFO_DEVICE_ID:
   case PI_DEVICE_INFO_PCI_ADDRESS:
   case PI_DEVICE_INFO_GPU_EU_COUNT:
   case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index c6658245cc03e..43b8d936ed88e 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -20,6 +20,7 @@
 #include <set>
 #include <sstream>
 #include <string>
+#include <sycl/detail/pi.h>
 #include <sycl/detail/spinlock.hpp>
 #include <thread>
 #include <utility>
@@ -2308,6 +2309,13 @@ pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName,
   case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT:
     // 2D USM fill and memset is not supported.
     return ReturnValue(pi_bool{false});
+  case PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
+    pi_memory_order_capabilities capabilities =
+        PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
+        PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL |
+        PI_MEMORY_ORDER_SEQ_CST;
+    return ReturnValue(capabilities);
+  }
   case PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
   default:
     // TODO: implement other parameters
@@ -3057,8 +3065,7 @@ pi_result piMemRetain(pi_mem Mem) {
 // If indirect access tracking is not enabled then this functions just performs
 // zeMemFree. If indirect access tracking is enabled then reference counting is
 // performed.
-static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr,
-                                 bool OwnZeMemHandle = true) {
+static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr) {
   pi_platform Plt = Context->getPlatform();
   std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
                                                  std::defer_lock);
@@ -3078,8 +3085,7 @@ static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr,
     Context->MemAllocs.erase(It);
   }
 
-  if (OwnZeMemHandle)
-    ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
+  ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
 
   if (IndirectAccessTrackingEnabled)
     PI_CALL(ContextReleaseHelper(Context));
@@ -3088,7 +3094,7 @@ static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr,
 }
 
 static pi_result USMFreeHelper(pi_context Context, void *Ptr,
-                               bool OwnZeMemHandle);
+                               bool OwnZeMemHandle = true);
 
 pi_result piMemRelease(pi_mem Mem) {
   PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT);
@@ -7042,10 +7048,8 @@ static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
-static pi_result USMFreeImpl(pi_context Context, void *Ptr,
-                             bool OwnZeMemHandle) {
-  if (OwnZeMemHandle)
-    ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
+static pi_result USMFreeImpl(pi_context Context, void *Ptr) {
+  ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
   return PI_SUCCESS;
 }
 
@@ -7104,8 +7108,8 @@ void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) {
   return Ptr;
 }
 
-void USMMemoryAllocBase::deallocate(void *Ptr, bool OwnZeMemHandle) {
-  auto Res = USMFreeImpl(Context, Ptr, OwnZeMemHandle);
+void USMMemoryAllocBase::deallocate(void *Ptr) {
+  auto Res = USMFreeImpl(Context, Ptr);
   if (Res != PI_SUCCESS) {
     throw UsmAllocationException(Res);
   }
@@ -7353,8 +7357,13 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr,
     Context->MemAllocs.erase(It);
   }
 
+  if (!OwnZeMemHandle) {
+    // Memory should not be freed
+    return PI_SUCCESS;
+  }
+
   if (!UseUSMAllocator) {
-    pi_result Res = USMFreeImpl(Context, Ptr, OwnZeMemHandle);
+    pi_result Res = USMFreeImpl(Context, Ptr);
     if (IndirectAccessTrackingEnabled)
       PI_CALL(ContextReleaseHelper(Context));
     return Res;
@@ -7373,7 +7382,7 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr,
   // If memory type is host release from host pool
   if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) {
     try {
-      Context->HostMemAllocContext->deallocate(Ptr, OwnZeMemHandle);
+      Context->HostMemAllocContext->deallocate(Ptr);
     } catch (const UsmAllocationException &Ex) {
       return Ex.getError();
     } catch (...) {
@@ -7401,16 +7410,16 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr,
     PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
 
     auto DeallocationHelper =
-        [Context, Device, Ptr,
-         OwnZeMemHandle](std::unordered_map<ze_device_handle_t, USMAllocContext>
-                             &AllocContextMap) {
+        [Context, Device,
+         Ptr](std::unordered_map<ze_device_handle_t, USMAllocContext>
+                  &AllocContextMap) {
           try {
             auto It = AllocContextMap.find(Device->ZeDevice);
             if (It == AllocContextMap.end())
               return PI_ERROR_INVALID_VALUE;
 
             // The right context is found, deallocate the pointer
-            It->second.deallocate(Ptr, OwnZeMemHandle);
+            It->second.deallocate(Ptr);
           } catch (const UsmAllocationException &Ex) {
             return Ex.getError();
           }
@@ -7436,7 +7445,7 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr,
     }
   }
 
-  pi_result Res = USMFreeImpl(Context, Ptr, OwnZeMemHandle);
+  pi_result Res = USMFreeImpl(Context, Ptr);
   if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) {
     Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator);
   }
@@ -7451,7 +7460,7 @@ pi_result piextUSMFree(pi_context Context, void *Ptr) {
   std::scoped_lock<pi_shared_mutex> Lock(
       IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex);
 
-  return USMFreeHelper(Context, Ptr, true /* OwnZeMemHandle */);
+  return USMFreeHelper(Context, Ptr);
 }
 
 pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex,
@@ -8371,11 +8380,11 @@ pi_result _pi_buffer::free() {
       std::scoped_lock<pi_shared_mutex> Lock(
           IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex);
 
-      PI_CALL(USMFreeHelper(Context, ZeHandle, true));
+      PI_CALL(USMFreeHelper(Context, ZeHandle));
       break;
     }
     case allocation_t::free_native:
-      PI_CALL(ZeMemFreeHelper(Context, ZeHandle, true));
+      PI_CALL(ZeMemFreeHelper(Context, ZeHandle));
       break;
     case allocation_t::unimport:
       ZeUSMImport.doZeUSMRelease(Context->getPlatform()->ZeDriver, ZeHandle);
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index f4b7d1098efe5..a7979a6413508 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -112,7 +112,7 @@ class USMMemoryAllocBase : public SystemMemory {
       : Context{Ctx}, Device{Dev} {}
   void *allocate(size_t Size) override final;
   void *allocate(size_t Size, size_t Alignment) override final;
-  void deallocate(void *Ptr, bool OwnZeMemHandle) override final;
+  void deallocate(void *Ptr) override final;
 };
 
 // Allocation routines for shared memory type
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index 2c44f0cfe9eb3..f2ea816b023f1 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -282,9 +282,126 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     // For details about Intel UUID extension, see
     // sycl/doc/extensions/supported/sycl_ext_intel_device_info.md
   case PI_DEVICE_INFO_UUID:
-  case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES:
-  case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
     return PI_ERROR_INVALID_VALUE;
+  case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
+    // This query is missing beore OpenCL 3.0
+    // Check version and handle appropriately
+    OCLV::OpenCLVersion devVer;
+    cl_device_id deviceID = cast<cl_device_id>(device);
+    cl_int ret_err = getDeviceVersion(deviceID, devVer);
+    if (ret_err != CL_SUCCESS) {
+      return cast<pi_result>(ret_err);
+    }
+
+    // Minimum required capability to be returned
+    // For OpenCL 1.2, this is all that is required
+    pi_memory_order_capabilities capabilities = PI_MEMORY_ORDER_RELAXED;
+
+    if (devVer >= OCLV::V3_0) {
+      // For OpenCL >=3.0, the query should be implemented
+      cl_device_atomic_capabilities cl_capabilities = 0;
+      cl_int ret_err = clGetDeviceInfo(
+          deviceID, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES,
+          sizeof(cl_device_atomic_capabilities), &cl_capabilities, nullptr);
+      if (ret_err != CL_SUCCESS)
+        return cast<pi_result>(ret_err);
+
+      // Mask operation to only consider atomic_memory_order* capabilities
+      cl_int mask = CL_DEVICE_ATOMIC_ORDER_RELAXED |
+                    CL_DEVICE_ATOMIC_ORDER_ACQ_REL |
+                    CL_DEVICE_ATOMIC_ORDER_SEQ_CST;
+      cl_capabilities &= mask;
+
+      // The memory order capabilities are hierarchical, if one is implied, all
+      // preceding capbilities are implied as well. Especially in the case of
+      // ACQ_REL.
+      if (cl_capabilities & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) {
+        capabilities |= PI_MEMORY_ORDER_SEQ_CST;
+      }
+      if (cl_capabilities & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) {
+        capabilities |= PI_MEMORY_ORDER_ACQ_REL | PI_MEMORY_ORDER_ACQUIRE |
+                        PI_MEMORY_ORDER_RELEASE;
+      }
+    } else if (devVer >= OCLV::V2_0) {
+      // For OpenCL 2.x, return all capabilities
+      // (https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_memory_consistency_model)
+      capabilities |= PI_MEMORY_ORDER_ACQUIRE | PI_MEMORY_ORDER_RELEASE |
+                      PI_MEMORY_ORDER_ACQ_REL | PI_MEMORY_ORDER_SEQ_CST;
+    }
+
+    if (paramValue) {
+      if (paramValueSize < sizeof(pi_memory_order_capabilities))
+        return static_cast<pi_result>(CL_INVALID_VALUE);
+
+      std::memcpy(paramValue, &capabilities, sizeof(capabilities));
+    }
+
+    if (paramValueSizeRet)
+      *paramValueSizeRet = sizeof(capabilities);
+
+    return static_cast<pi_result>(CL_SUCCESS);
+  }
+  case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
+    // Initialize result to minimum mandated capabilities according to
+    // SYCL2020 4.6.3.2
+    // Because scopes are hierarchical, wider scopes support all narrower
+    // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and
+    // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382)
+    pi_memory_scope_capabilities result = PI_MEMORY_SCOPE_WORK_ITEM |
+                                          PI_MEMORY_SCOPE_SUB_GROUP |
+                                          PI_MEMORY_SCOPE_WORK_GROUP;
+
+    OCLV::OpenCLVersion devVer;
+
+    cl_device_id deviceID = cast<cl_device_id>(device);
+    cl_int ret_err = getDeviceVersion(deviceID, devVer);
+    if (ret_err != CL_SUCCESS)
+      return static_cast<pi_result>(ret_err);
+
+    cl_device_atomic_capabilities devCapabilities = 0;
+    if (devVer >= OCLV::V3_0) {
+      ret_err = clGetDeviceInfo(deviceID, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES,
+                                sizeof(cl_device_atomic_capabilities),
+                                &devCapabilities, nullptr);
+      if (ret_err != CL_SUCCESS)
+        return static_cast<pi_result>(ret_err);
+      assert((devCapabilities & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) &&
+             "Violates minimum mandated guarantee");
+
+      // Because scopes are hierarchical, wider scopes support all narrower
+      // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and
+      // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382)
+      // We already initialized to these minimum mandated capabilities. Just
+      // check wider scopes.
+      if (devCapabilities & CL_DEVICE_ATOMIC_SCOPE_DEVICE) {
+        result |= PI_MEMORY_SCOPE_DEVICE;
+      }
+
+      if (devCapabilities & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) {
+        result |= PI_MEMORY_SCOPE_SYSTEM;
+      }
+
+    } else {
+      // This info is only available in OpenCL version >= 3.0
+      // Just return minimum mandated capabilities for older versions.
+      // OpenCL 1.x minimum mandated capabilities are WORK_GROUP, we
+      // already initialized using it.
+      if (devVer >= OCLV::V2_0) {
+        // OpenCL 2.x minimum mandated capabilities are WORK_GROUP | DEVICE |
+        // ALL_DEVICES
+        result |= PI_MEMORY_SCOPE_DEVICE | PI_MEMORY_SCOPE_SYSTEM;
+      }
+    }
+    if (paramValue) {
+      if (paramValueSize < sizeof(cl_device_atomic_capabilities))
+        return PI_ERROR_INVALID_VALUE;
+
+      std::memcpy(paramValue, &result, sizeof(result));
+    }
+    if (paramValueSizeRet)
+      *paramValueSizeRet = sizeof(result);
+    return PI_SUCCESS;
+  }
   case PI_DEVICE_INFO_ATOMIC_64: {
     cl_int ret_err = CL_SUCCESS;
     cl_bool result = CL_FALSE;
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
index ffe3bb6555e06..fef607f3ef185 100644
--- a/sycl/plugins/unified_runtime/pi2ur.hpp
+++ b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -485,6 +485,8 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
        (ur_device_info_t)UR_DEVICE_INFO_BFLOAT16},
       {PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES,
        (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES},
+      {PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES,
+       (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES},
   };
 
   auto InfoType = InfoMapping.find(ParamName);
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp
index 934084ae67736..c859c166c7735 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp
@@ -1173,9 +1173,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
     // bfloat16 math functions are not yet supported on Intel GPUs.
     return ReturnValue(bool{false});
   }
+  case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
+    // There are no explicit restrictions in L0 programming guide, so assume all
+    // are supported
+    ur_memory_scope_capability_flags_t result =
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM;
+
+    return ReturnValue(result);
+  }
+
+  case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
+    ur_memory_order_capability_flags_t capabilities =
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE |
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL |
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST;
+    return ReturnValue(capabilities);
+  }
 
   // TODO: Implement.
-  case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
   default:
     zePrint("Unsupported ParamName in piGetDeviceInfo\n");
     zePrint("ParamName=%d(0x%x)\n", ParamName, ParamName);
@@ -1706,7 +1727,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition(
   // Currently supported partitioning (by affinity domain/numa) would always
   // partition to all sub-devices.
   //
-  if (NumDevices !=0)
+  if (NumDevices != 0)
     PI_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE);
 
   for (uint32_t I = 0; I < NumDevices; I++) {
diff --git a/sycl/plugins/unified_runtime/ur/usm_allocator.cpp b/sycl/plugins/unified_runtime/ur/usm_allocator.cpp
index 1716e7d546246..31490a6ce8513 100644
--- a/sycl/plugins/unified_runtime/ur/usm_allocator.cpp
+++ b/sycl/plugins/unified_runtime/ur/usm_allocator.cpp
@@ -290,7 +290,7 @@ class USMAllocContext::USMAllocImpl {
 
   void *allocate(size_t Size, size_t Alignment, bool &FromPool);
   void *allocate(size_t Size, bool &FromPool);
-  void deallocate(void *Ptr, bool &ToPool, bool OwnZeMemHandle);
+  void deallocate(void *Ptr, bool &ToPool);
 
   SystemMemory &getMemHandle() { return *MemHandle; }
 
@@ -332,7 +332,7 @@ Slab::Slab(Bucket &Bkt)
 
 Slab::~Slab() {
   unregSlab(*this);
-  bucket.getMemHandle().deallocate(MemPtr, true /* OwnZeMemHandle */);
+  bucket.getMemHandle().deallocate(MemPtr);
 }
 
 // Return the index of the first available chunk, -1 otherwize
@@ -737,8 +737,7 @@ Bucket &USMAllocContext::USMAllocImpl::findBucket(size_t Size) {
   return *(*It);
 }
 
-void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool,
-                                               bool OwnZeMemHandle) {
+void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool) {
   auto *SlabPtr = AlignPtrDown(Ptr, SlabMinSize());
 
   // Lock the map on read
@@ -748,7 +747,7 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool,
   auto Slabs = getKnownSlabs().equal_range(SlabPtr);
   if (Slabs.first == Slabs.second) {
     Lk.unlock();
-    getMemHandle().deallocate(Ptr, OwnZeMemHandle);
+    getMemHandle().deallocate(Ptr);
     return;
   }
 
@@ -779,7 +778,7 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool,
   // There is a rare case when we have a pointer from system allocation next
   // to some slab with an entry in the map. So we find a slab
   // but the range checks fail.
-  getMemHandle().deallocate(Ptr, OwnZeMemHandle);
+  getMemHandle().deallocate(Ptr);
 }
 
 USMAllocContext::USMAllocContext(std::unique_ptr<SystemMemory> MemHandle,
@@ -813,9 +812,9 @@ void *USMAllocContext::allocate(size_t size, size_t alignment) {
   return Ptr;
 }
 
-void USMAllocContext::deallocate(void *ptr, bool OwnZeMemHandle) {
+void USMAllocContext::deallocate(void *ptr) {
   bool ToPool;
-  pImpl->deallocate(ptr, ToPool, OwnZeMemHandle);
+  pImpl->deallocate(ptr, ToPool);
 
   if (pImpl->getParams().PoolTrace > 2) {
     auto MT = pImpl->getParams().memoryTypeName;
diff --git a/sycl/plugins/unified_runtime/ur/usm_allocator.hpp b/sycl/plugins/unified_runtime/ur/usm_allocator.hpp
index 6d3fa279bde1d..58a3187b00d3e 100755
--- a/sycl/plugins/unified_runtime/ur/usm_allocator.hpp
+++ b/sycl/plugins/unified_runtime/ur/usm_allocator.hpp
@@ -17,7 +17,7 @@ class SystemMemory {
 public:
   virtual void *allocate(size_t size) = 0;
   virtual void *allocate(size_t size, size_t aligned) = 0;
-  virtual void deallocate(void *ptr, bool OwnZeMemHandle) = 0;
+  virtual void deallocate(void *ptr) = 0;
   virtual ~SystemMemory() = default;
 };
 
@@ -68,7 +68,7 @@ class USMAllocContext {
 
   void *allocate(size_t size);
   void *allocate(size_t size, size_t alignment);
-  void deallocate(void *ptr, bool OwnZeMemHandle);
+  void deallocate(void *ptr);
 
 private:
   std::unique_ptr<USMAllocImpl> pImpl;
diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp
index 22818eeed302c..198b4f1cc2d30 100644
--- a/sycl/source/detail/context_impl.cpp
+++ b/sycl/source/detail/context_impl.cpp
@@ -23,6 +23,8 @@
 #include <sycl/property_list.hpp>
 #include <sycl/stl.hpp>
 
+#include <algorithm>
+
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
@@ -166,17 +168,26 @@ template <>
 std::vector<sycl::memory_order>
 context_impl::get_info<info::context::atomic_memory_order_capabilities>()
     const {
+  std::vector<sycl::memory_order> CapabilityList{
+      sycl::memory_order::relaxed, sycl::memory_order::acquire,
+      sycl::memory_order::release, sycl::memory_order::acq_rel,
+      sycl::memory_order::seq_cst};
   if (is_host())
-    return {sycl::memory_order::relaxed, sycl::memory_order::acquire,
-            sycl::memory_order::release, sycl::memory_order::acq_rel,
-            sycl::memory_order::seq_cst};
+    return CapabilityList;
+
+  for (const sycl::device &Device : MDevices) {
+    std::vector<sycl::memory_order> NewCapabilityList(CapabilityList.size());
+    std::vector<sycl::memory_order> DeviceCapabilities =
+        Device.get_info<info::device::atomic_memory_order_capabilities>();
+    std::set_intersection(
+        CapabilityList.begin(), CapabilityList.end(),
+        DeviceCapabilities.begin(), DeviceCapabilities.end(),
+        std::inserter(NewCapabilityList, NewCapabilityList.begin()));
+    CapabilityList = NewCapabilityList;
+  }
+  CapabilityList.shrink_to_fit();
 
-  pi_memory_order_capabilities Result;
-  getPlugin().call<PiApiKind::piContextGetInfo>(
-      MContext,
-      PiInfoCode<info::context::atomic_memory_order_capabilities>::value,
-      sizeof(Result), &Result, nullptr);
-  return readMemoryOrderBitfield(Result);
+  return CapabilityList;
 }
 template <>
 std::vector<sycl::memory_scope>
diff --git a/sycl/source/detail/context_info.hpp b/sycl/source/detail/context_info.hpp
index 1056557ec2bad..d30113dc6d08e 100644
--- a/sycl/source/detail/context_info.hpp
+++ b/sycl/source/detail/context_info.hpp
@@ -29,17 +29,6 @@ typename Param::return_type get_context_info(RT::PiContext Ctx,
   return Result;
 }
 
-// Specialization for atomic_memory_order_capabilities, PI returns a bitfield
-template <>
-std::vector<sycl::memory_order>
-get_context_info<info::context::atomic_memory_order_capabilities>(
-    RT::PiContext Ctx, const plugin &Plugin) {
-  pi_memory_order_capabilities Result;
-  Plugin.call<PiApiKind::piContextGetInfo>(
-      Ctx, PiInfoCode<info::context::atomic_memory_order_capabilities>::value,
-      sizeof(Result), &Result, nullptr);
-  return readMemoryOrderBitfield(Result);
-}
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp
index 1efa49a200f2a..007acae7e13d2 100644
--- a/sycl/source/detail/platform_impl.cpp
+++ b/sycl/source/detail/platform_impl.cpp
@@ -75,23 +75,28 @@ static bool IsBannedPlatform(platform Platform) {
   // To avoid problems on default users and deployment of DPC++ on platforms
   // where CUDA is available, the OpenCL support is disabled.
   //
-  auto IsNVIDIAOpenCL = [](platform Platform) {
+  // There is also no support for the AMD HSA backend for OpenCL consumption,
+  // as well as reported problems with device queries, so AMD OpenCL support
+  // is disabled as well.
+  //
+  auto IsMatchingOpenCL = [](platform Platform, const std::string_view name) {
     if (getSyclObjImpl(Platform)->is_host())
       return false;
 
-    const bool HasCUDA = Platform.get_info<info::platform::name>().find(
-                             "NVIDIA CUDA") != std::string::npos;
+    const bool HasNameMatch = Platform.get_info<info::platform::name>().find(
+                                  name) != std::string::npos;
     const auto Backend =
         detail::getSyclObjImpl(Platform)->getPlugin().getBackend();
-    const bool IsCUDAOCL = (HasCUDA && Backend == backend::opencl);
-    if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL) && IsCUDAOCL) {
-      std::cout << "SYCL_PI_TRACE[all]: "
-                << "NVIDIA CUDA OpenCL platform found but is not compatible."
-                << std::endl;
+    const bool IsMatchingOCL = (HasNameMatch && Backend == backend::opencl);
+    if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL) &&
+        IsMatchingOCL) {
+      std::cout << "SYCL_PI_TRACE[all]: " << name
+                << " OpenCL platform found but is not compatible." << std::endl;
     }
-    return IsCUDAOCL;
+    return IsMatchingOCL;
   };
-  return IsNVIDIAOpenCL(Platform);
+  return IsMatchingOpenCL(Platform, "NVIDIA CUDA") ||
+         IsMatchingOpenCL(Platform, "AMD Accelerated Parallel Processing");
 }
 
 // This routine has the side effect of registering each platform's last device
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index a197193d35432..d6bc161428708 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -567,7 +567,7 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(
     while (AIt != AEnd) {
       auto Aspect = static_cast<aspect>(*AIt);
       // Strict check for fp64 is disabled temporarily to avoid confusion.
-      if (Aspect != aspect::fp64 && !Dev->has(Aspect))
+      if (!Dev->has(Aspect))
         throw sycl::exception(errc::kernel_not_supported,
                               "Required aspect " + getAspectNameStr(Aspect) +
                                   " is not supported on the device");
@@ -1184,6 +1184,7 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) {
     const _pi_offload_entry EntriesB = RawImg->EntriesBegin;
     const _pi_offload_entry EntriesE = RawImg->EntriesEnd;
     auto Img = make_unique_ptr<RTDeviceBinaryImage>(RawImg, M);
+    static uint32_t SequenceID = 0;
 
     // Fill the kernel argument mask map
     const RTDeviceBinaryImage::PropertyRange &KPOIRange =
@@ -1257,6 +1258,13 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) {
       if (KSIdIt != KSIdMap.end()) {
         auto &Imgs = m_DeviceImages[KSIdIt->second];
         assert(Imgs && "Device image vector should have been already created");
+        if (DumpImages) {
+          const bool NeedsSequenceID =
+              std::any_of(Imgs->begin(), Imgs->end(), [&](auto &I) {
+                return I->getFormat() == Img->getFormat();
+              });
+          dumpImage(*Img, KSIdIt->second, NeedsSequenceID ? ++SequenceID : 0);
+        }
 
         cacheKernelUsesAssertInfo(M, *Img);
 
@@ -1379,12 +1387,14 @@ ProgramManager::getKernelSetId(OSModuleHandle M,
                       PI_ERROR_INVALID_KERNEL_NAME);
 }
 
-void ProgramManager::dumpImage(const RTDeviceBinaryImage &Img,
-                               KernelSetId KSId) const {
+void ProgramManager::dumpImage(const RTDeviceBinaryImage &Img, KernelSetId KSId,
+                               uint32_t SequenceID) const {
   std::string Fname("sycl_");
   const pi_device_binary_struct &RawImg = Img.getRawData();
   Fname += RawImg.DeviceTargetSpec;
   Fname += std::to_string(KSId);
+  if (SequenceID)
+    Fname += '_' + std::to_string(SequenceID);
   std::string Ext;
 
   RT::PiDeviceBinaryType Format = Img.getFormat();
@@ -1683,46 +1693,120 @@ ProgramManager::getSYCLDeviceImagesWithCompatibleState(
   }
   assert(BinImages.size() > 0 && "Expected to find at least one device image");
 
+  // Ignore images with incompatible state. Image is considered compatible
+  // with a target state if an image is already in the target state or can
+  // be brought to target state by compiling/linking/building.
+  //
+  // Example: an image in "executable" state is not compatible with
+  // "input" target state - there is no operation to convert the image it
+  // to "input" state. An image in "input" state is compatible with
+  // "executable" target state because it can be built to get into
+  // "executable" state.
+  for (auto It = BinImages.begin(); It != BinImages.end();) {
+    if (getBinImageState(*It) > TargetState)
+      It = BinImages.erase(It);
+    else
+      ++It;
+  }
+
   std::vector<device_image_plain> SYCLDeviceImages;
-  for (RTDeviceBinaryImage *BinImage : BinImages) {
-    const bundle_state ImgState = getBinImageState(BinImage);
-
-    // Ignore images with incompatible state. Image is considered compatible
-    // with a target state if an image is already in the target state or can
-    // be brought to target state by compiling/linking/building.
-    //
-    // Example: an image in "executable" state is not compatible with
-    // "input" target state - there is no operation to convert the image it
-    // to "input" state. An image in "input" state is compatible with
-    // "executable" target state because it can be built to get into
-    // "executable" state.
-    if (ImgState > TargetState)
-      continue;
 
-    for (const sycl::device &Dev : Devs) {
+  // If a non-input state is requested, we can filter out some compatible
+  // images and return only those with the highest compatible state for each
+  // device-kernel pair. This map tracks how many kernel-device pairs need each
+  // image, so that any unneeded ones are skipped.
+  // TODO this has no effect if the requested state is input, consider having
+  // a separate branch for that case to avoid unnecessary tracking work.
+  struct DeviceBinaryImageInfo {
+    std::shared_ptr<std::vector<sycl::kernel_id>> KernelIDs;
+    bundle_state State = bundle_state::input;
+    int RequirementCounter = 0;
+  };
+  std::unordered_map<RTDeviceBinaryImage *, DeviceBinaryImageInfo> ImageInfoMap;
+
+  for (const sycl::device &Dev : Devs) {
+    // Track the highest image state for each requested kernel.
+    using StateImagesPairT =
+        std::pair<bundle_state, std::vector<RTDeviceBinaryImage *>>;
+    using KernelImageMapT =
+        std::map<kernel_id, StateImagesPairT, LessByNameComp>;
+    KernelImageMapT KernelImageMap;
+    if (!KernelIDs.empty())
+      for (const kernel_id &KernelID : KernelIDs)
+        KernelImageMap.insert({KernelID, {}});
+
+    for (RTDeviceBinaryImage *BinImage : BinImages) {
       if (!compatibleWithDevice(BinImage, Dev) ||
           !doesDevSupportDeviceRequirements(Dev, *BinImage))
         continue;
 
-      std::shared_ptr<std::vector<sycl::kernel_id>> KernelIDs;
-      // Collect kernel names for the image
-      {
-        std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
-        KernelIDs = m_BinImg2KernelIDs[BinImage];
-        // If the image does not contain any non-service kernels we can skip it.
-        if (!KernelIDs || KernelIDs->empty())
-          continue;
+      auto InsertRes = ImageInfoMap.insert({BinImage, {}});
+      DeviceBinaryImageInfo &ImgInfo = InsertRes.first->second;
+      if (InsertRes.second) {
+        ImgInfo.State = getBinImageState(BinImage);
+        // Collect kernel names for the image
+        {
+          std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
+          ImgInfo.KernelIDs = m_BinImg2KernelIDs[BinImage];
+        }
       }
+      const bundle_state ImgState = ImgInfo.State;
+      const std::shared_ptr<std::vector<sycl::kernel_id>> &ImageKernelIDs =
+          ImgInfo.KernelIDs;
+      int &ImgRequirementCounter = ImgInfo.RequirementCounter;
 
-      DeviceImageImplPtr Impl = std::make_shared<detail::device_image_impl>(
-          BinImage, Ctx, Devs, ImgState, KernelIDs, /*PIProgram=*/nullptr);
+      // If the image does not contain any non-service kernels we can skip it.
+      if (!ImageKernelIDs || ImageKernelIDs->empty())
+        continue;
 
-      SYCLDeviceImages.push_back(
-          createSyclObjFromImpl<device_image_plain>(Impl));
-      break;
+      // Update tracked information.
+      for (kernel_id &KernelID : *ImageKernelIDs) {
+        StateImagesPairT *StateImagesPair;
+        // If only specific kernels are requested, ignore the rest.
+        if (!KernelIDs.empty()) {
+          auto It = KernelImageMap.find(KernelID);
+          if (It == KernelImageMap.end())
+            continue;
+          StateImagesPair = &It->second;
+        } else
+          StateImagesPair = &KernelImageMap[KernelID];
+
+        auto &[KernelImagesState, KernelImages] = *StateImagesPair;
+
+        if (KernelImages.empty()) {
+          KernelImagesState = ImgState;
+          KernelImages.push_back(BinImage);
+          ++ImgRequirementCounter;
+        } else if (KernelImagesState < ImgState) {
+          for (RTDeviceBinaryImage *Img : KernelImages) {
+            auto It = ImageInfoMap.find(Img);
+            assert(It != ImageInfoMap.end());
+            assert(It->second.RequirementCounter > 0);
+            --(It->second.RequirementCounter);
+          }
+          KernelImages.clear();
+          KernelImages.push_back(BinImage);
+          KernelImagesState = ImgState;
+          ++ImgRequirementCounter;
+        } else if (KernelImagesState == ImgState) {
+          KernelImages.push_back(BinImage);
+          ++ImgRequirementCounter;
+        }
+      }
     }
   }
 
+  for (const auto &ImgInfoPair : ImageInfoMap) {
+    if (ImgInfoPair.second.RequirementCounter == 0)
+      continue;
+
+    DeviceImageImplPtr Impl = std::make_shared<detail::device_image_impl>(
+        ImgInfoPair.first, Ctx, Devs, ImgInfoPair.second.State,
+        ImgInfoPair.second.KernelIDs, /*PIProgram=*/nullptr);
+
+    SYCLDeviceImages.push_back(createSyclObjFromImpl<device_image_plain>(Impl));
+  }
+
   return SYCLDeviceImages;
 }
 
@@ -2235,7 +2319,7 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
     while (!Aspects.empty()) {
       aspect Aspect = Aspects.consume<aspect>();
       // Strict check for fp64 is disabled temporarily to avoid confusion.
-      if (Aspect != aspect::fp64 && !Dev.has(Aspect))
+      if (!Dev.has(Aspect))
         return false;
     }
   }
diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp
index e85976eb621f4..1bd81491462fa 100644
--- a/sycl/source/detail/program_manager/program_manager.hpp
+++ b/sycl/source/detail/program_manager/program_manager.hpp
@@ -299,7 +299,8 @@ class ProgramManager {
   KernelSetId getKernelSetId(OSModuleHandle M,
                              const std::string &KernelName) const;
   /// Dumps image to current directory
-  void dumpImage(const RTDeviceBinaryImage &Img, KernelSetId KSId) const;
+  void dumpImage(const RTDeviceBinaryImage &Img, KernelSetId KSId,
+                 uint32_t SequenceID = 0) const;
 
   /// Add info on kernels using assert into cache
   void cacheKernelUsesAssertInfo(OSModuleHandle M, RTDeviceBinaryImage &Img);
diff --git a/sycl/test/basic_tests/accessor/accessor_get_pointer.cpp b/sycl/test/basic_tests/accessor/accessor_get_pointer.cpp
new file mode 100644
index 0000000000000..acb63f9ea33d8
--- /dev/null
+++ b/sycl/test/basic_tests/accessor/accessor_get_pointer.cpp
@@ -0,0 +1,27 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -fsyntax-only
+
+#include <cassert>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+
+using namespace sycl;
+
+constexpr static int size = 1;
+
+void test_get_multi_ptr(handler &cgh, buffer<int, size> &buffer) {
+  using target_local_accessor_t =
+      accessor<int, size, access::mode::read_write, access::target::local>;
+  using local_accessor_t = local_accessor<int, size>;
+
+  auto acc = buffer.get_access<access_mode::read_write, target::host_task>(cgh);
+  auto target_local_acc = target_local_accessor_t({size}, cgh);
+  auto local_acc = local_accessor_t({size}, cgh);
+
+  auto acc_ptr = acc.get_pointer();
+  auto target_local_ptr = target_local_acc.get_pointer();
+  auto local_pointer = local_acc.get_pointer();
+  static_assert(std::is_same_v<decltype(acc_ptr), std::add_pointer_t<int>>);
+  static_assert(std::is_same_v<decltype(target_local_ptr), local_ptr<int>>);
+  static_assert(
+      std::is_same_v<decltype(local_pointer), std::add_pointer_t<int>>);
+}
\ No newline at end of file
diff --git a/sycl/test/basic_tests/interop-backend-traits-cuda.cpp b/sycl/test/basic_tests/interop-backend-traits-cuda.cpp
new file mode 100644
index 0000000000000..5ae25a02de048
--- /dev/null
+++ b/sycl/test/basic_tests/interop-backend-traits-cuda.cpp
@@ -0,0 +1,30 @@
+// REQUIRES: cuda_be
+// RUN: %clangxx -fsycl -fsyntax-only %s
+// RUN: %clangxx -fsycl -fsyntax-only -DUSE_CUDA_EXPERIMENTAL %s
+
+#ifdef USE_CUDA_EXPERIMENTAL
+#define SYCL_EXT_ONEAPI_BACKEND_CUDA 1
+#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
+#include <sycl/ext/oneapi/experimental/backend/cuda.hpp>
+#endif
+
+#include <sycl/sycl.hpp>
+
+constexpr auto Backend = sycl::backend::ext_oneapi_cuda;
+
+int main() {
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::device>,
+                     sycl::detail::interop<Backend, sycl::device>::type>);
+#ifndef USE_CUDA_EXPERIMENTAL
+  // CUDA experimental return type is different to input type
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::context>,
+                     sycl::detail::interop<Backend, sycl::context>::type>);
+#endif
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::queue>,
+                     sycl::detail::interop<Backend, sycl::queue>::type>);
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/interop-backend-traits-hip.cpp b/sycl/test/basic_tests/interop-backend-traits-hip.cpp
new file mode 100644
index 0000000000000..d7a9b2ee8f399
--- /dev/null
+++ b/sycl/test/basic_tests/interop-backend-traits-hip.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: hip_be
+// RUN: %clangxx -fsycl -fsyntax-only %s
+
+#include <sycl/sycl.hpp>
+
+constexpr auto Backend = sycl::backend::ext_oneapi_hip;
+
+int main() {
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::device>,
+                     sycl::detail::interop<Backend, sycl::device>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::queue>,
+                     sycl::detail::interop<Backend, sycl::queue>::type>);
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/interop-backend-traits-level-zero.cpp b/sycl/test/basic_tests/interop-backend-traits-level-zero.cpp
new file mode 100644
index 0000000000000..97dff0d123eb8
--- /dev/null
+++ b/sycl/test/basic_tests/interop-backend-traits-level-zero.cpp
@@ -0,0 +1,24 @@
+// REQUIRES: level_zero_be
+// RUN: %clangxx %fsycl-host-only -fsyntax-only %s
+
+#include <sycl/sycl.hpp>
+#include <ze_api.h>
+
+constexpr auto Backend = sycl::backend::ext_oneapi_level_zero;
+
+int main() {
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::platform>,
+                     sycl::detail::interop<Backend, sycl::platform>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::platform>,
+                     sycl::detail::interop<Backend, sycl::platform>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::device>,
+                     sycl::detail::interop<Backend, sycl::device>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::queue>,
+                     sycl::detail::interop<Backend, sycl::queue>::type>);
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/interop-backend-traits-opencl.cpp b/sycl/test/basic_tests/interop-backend-traits-opencl.cpp
new file mode 100644
index 0000000000000..e2c0c4d856b7d
--- /dev/null
+++ b/sycl/test/basic_tests/interop-backend-traits-opencl.cpp
@@ -0,0 +1,36 @@
+// REQUIRES: opencl_be
+// RUN: %clangxx -fsycl -fsyntax-only %s
+
+#include <CL/cl.h>
+#include <sycl/sycl.hpp>
+
+constexpr auto Backend = sycl::backend::opencl;
+
+int main() {
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::device>,
+                     sycl::detail::interop<Backend, sycl::device>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::context>,
+                     sycl::detail::interop<Backend, sycl::context>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::queue>,
+                     sycl::detail::interop<Backend, sycl::queue>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::event>,
+                     sycl::detail::interop<Backend, sycl::event>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::platform>,
+                     sycl::detail::interop<Backend, sycl::platform>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::platform>,
+                     sycl::detail::interop<Backend, sycl::platform>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::device>,
+                     sycl::detail::interop<Backend, sycl::device>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::queue>,
+                     sycl::detail::interop<Backend, sycl::queue>::type>);
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/interop-backend-traits.cpp b/sycl/test/basic_tests/interop-backend-traits.cpp
deleted file mode 100644
index b76b5ca1dc8e4..0000000000000
--- a/sycl/test/basic_tests/interop-backend-traits.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-// RUN: %clangxx -fsycl -fsyntax-only -DUSE_OPENCL %s
-// RUN: %clangxx %fsycl-host-only -fsyntax-only -DUSE_L0 %s
-// RUN: %clangxx -fsycl -fsyntax-only -DUSE_CUDA %s
-// RUN: %clangxx -fsycl -fsyntax-only -DUSE_HIP %s
-// RUN: %clangxx -fsycl -fsyntax-only -DUSE_CUDA_EXPERIMENTAL %s
-
-#ifdef USE_OPENCL
-#include <CL/cl.h>
-
-#include <sycl/backend/opencl.hpp>
-
-constexpr auto Backend = sycl::backend::opencl;
-#endif
-
-#ifdef USE_L0
-#include <ze_api.h>
-
-#include <sycl/ext/oneapi/backend/level_zero.hpp>
-
-constexpr auto Backend = sycl::backend::ext_oneapi_level_zero;
-#endif
-
-#ifdef USE_CUDA
-#include <sycl/detail/backend_traits_cuda.hpp>
-
-constexpr auto Backend = sycl::backend::ext_oneapi_cuda;
-#endif
-
-#ifdef USE_HIP
-#include <sycl/detail/backend_traits_hip.hpp>
-
-constexpr auto Backend = sycl::backend::ext_oneapi_hip;
-#endif
-
-#ifdef USE_CUDA_EXPERIMENTAL
-#define SYCL_EXT_ONEAPI_BACKEND_CUDA 1
-#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
-#include <sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp>
-#include <sycl/ext/oneapi/experimental/backend/cuda.hpp>
-
-constexpr auto Backend = sycl::backend::ext_oneapi_cuda;
-#endif
-
-#include <sycl/sycl.hpp>
-
-int main() {
-#ifdef USE_OPENCL
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::device>,
-                     sycl::detail::interop<Backend, sycl::device>::type>);
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::context>,
-                     sycl::detail::interop<Backend, sycl::context>::type>);
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::queue>,
-                     sycl::detail::interop<Backend, sycl::queue>::type>);
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::event>,
-                     sycl::detail::interop<Backend, sycl::event>::type>);
-#endif
-
-// CUDA does not have a native type for platforms
-// HIP also should follow the same behavior - need confirmation
-#if !(defined(USE_CUDA) || defined(USE_HIP))
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::platform>,
-                     sycl::detail::interop<Backend, sycl::platform>::type>);
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::platform>,
-                     sycl::detail::interop<Backend, sycl::platform>::type>);
-#endif
-
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::device>,
-                     sycl::detail::interop<Backend, sycl::device>::type>);
-// CUDA experimental return type is different to inpt type
-#ifndef USE_CUDA_EXPERIMENTAL
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::context>,
-                     sycl::detail::interop<Backend, sycl::context>::type>);
-#endif
-  static_assert(
-      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::queue>,
-                     sycl::detail::interop<Backend, sycl::queue>::type>);
-
-  return 0;
-}
diff --git a/sycl/test/basic_tests/is_group_trait.cpp b/sycl/test/basic_tests/is_group_trait.cpp
new file mode 100644
index 0000000000000..60e7dd349ac8f
--- /dev/null
+++ b/sycl/test/basic_tests/is_group_trait.cpp
@@ -0,0 +1,22 @@
+// RUN: %clangxx -fsycl %s
+
+#include <sycl/sycl.hpp>
+
+template <typename T, typename ExpectedBaseType> void Check() {
+  static_assert(std::is_base_of_v<ExpectedBaseType, sycl::is_group<T>>);
+  static_assert(sycl::is_group<T>::value == ExpectedBaseType::value);
+  static_assert(sycl::is_group_v<T> == ExpectedBaseType::value);
+}
+
+int main() {
+  Check<sycl::group<1>, std::true_type>();
+  Check<sycl::group<2>, std::true_type>();
+  Check<sycl::group<3>, std::true_type>();
+  Check<sycl::sub_group, std::true_type>();
+
+  Check<int, std::false_type>();
+  Check<sycl::queue, std::false_type>();
+  Check<sycl::device, std::false_type>();
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/reduction/reducer_copy_move.cpp b/sycl/test/basic_tests/reduction/reducer_copy_move.cpp
new file mode 100644
index 0000000000000..707a880f51979
--- /dev/null
+++ b/sycl/test/basic_tests/reduction/reducer_copy_move.cpp
@@ -0,0 +1,79 @@
+// RUN: %clangxx -fsycl -fsyntax-only %s
+
+// Tests that the reducer class is neither movable nor copyable.
+
+#include <sycl/sycl.hpp>
+
+#include <type_traits>
+
+template <class T> struct PlusWithoutIdentity {
+  T operator()(const T &A, const T &B) const { return A + B; }
+};
+
+template <typename ReducerT> static constexpr void checkReducer() {
+  static_assert(!std::is_copy_constructible_v<ReducerT>);
+  static_assert(!std::is_move_constructible_v<ReducerT>);
+  static_assert(!std::is_copy_assignable_v<ReducerT>);
+  static_assert(!std::is_move_assignable_v<ReducerT>);
+}
+
+int main() {
+  sycl::queue Q;
+
+  int *ScalarMem = sycl::malloc_shared<int>(1, Q);
+  int *SpanMem = sycl::malloc_shared<int>(8, Q);
+  auto ScalarRed1 = sycl::reduction(ScalarMem, std::plus<int>{});
+  auto ScalarRed2 = sycl::reduction(ScalarMem, PlusWithoutIdentity<int>{});
+  auto SpanRed1 =
+      sycl::reduction(sycl::span<int, 8>{SpanMem, 8}, std::plus<int>{});
+  auto SpanRed2 = sycl::reduction(sycl::span<int, 8>{SpanMem, 8},
+                                  PlusWithoutIdentity<int>{});
+
+  Q.parallel_for(sycl::range<1>{1024}, ScalarRed1,
+                 [=](sycl::item<1>, auto &Reducer) {
+                   checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+                 });
+
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed1,
+                 [=](sycl::nd_item<1>, auto &Reducer) {
+                   checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+                 });
+
+  Q.parallel_for(sycl::range<1>{1024}, ScalarRed2,
+                 [=](sycl::item<1>, auto &Reducer) {
+                   checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+                 });
+
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed2,
+                 [=](sycl::nd_item<1>, auto &Reducer) {
+                   checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+                 });
+
+  Q.parallel_for(
+      sycl::range<1>{1024}, SpanRed1, [=](sycl::item<1>, auto &Reducer) {
+        checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+        checkReducer<std::remove_reference_t<decltype(Reducer[0])>>();
+      });
+
+  Q.parallel_for(
+      sycl::nd_range<1>{1024, 1024}, SpanRed1,
+      [=](sycl::nd_item<1>, auto &Reducer) {
+        checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+        checkReducer<std::remove_reference_t<decltype(Reducer[0])>>();
+      });
+
+  Q.parallel_for(
+      sycl::range<1>{1024}, SpanRed2, [=](sycl::item<1>, auto &Reducer) {
+        checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+        checkReducer<std::remove_reference_t<decltype(Reducer[0])>>();
+      });
+
+  Q.parallel_for(
+      sycl::nd_range<1>{1024, 1024}, SpanRed2,
+      [=](sycl::nd_item<1>, auto &Reducer) {
+        checkReducer<std::remove_reference_t<decltype(Reducer)>>();
+        checkReducer<std::remove_reference_t<decltype(Reducer[0])>>();
+      });
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/reduction/reducer_copy_move_negative.cpp b/sycl/test/basic_tests/reduction/reducer_copy_move_negative.cpp
new file mode 100644
index 0000000000000..32b1d08e50f53
--- /dev/null
+++ b/sycl/test/basic_tests/reduction/reducer_copy_move_negative.cpp
@@ -0,0 +1,221 @@
+// RUN: %clangxx -fsycl -fsyntax-only -ferror-limit=0 -Xclang -verify %s -Xclang -verify-ignore-unexpected=note
+
+// Tests the errors emitted from using the deleted copy and move assignment
+// operators and constructors.
+
+#include <sycl/sycl.hpp>
+
+#include <type_traits>
+
+template <class T> struct PlusWithoutIdentity {
+  T operator()(const T &A, const T &B) const { return A + B; }
+};
+
+int main() {
+  sycl::queue Q;
+
+  int *ScalarMem = sycl::malloc_shared<int>(1, Q);
+  int *SpanMem = sycl::malloc_shared<int>(8, Q);
+  auto ScalarRed1 = sycl::reduction(ScalarMem, std::plus<int>{});
+  auto ScalarRed2 = sycl::reduction(ScalarMem, PlusWithoutIdentity<int>{});
+  auto SpanRed1 =
+      sycl::reduction(sycl::span<int, 8>{SpanMem, 8}, std::plus<int>{});
+  auto SpanRed2 = sycl::reduction(sycl::span<int, 8>{SpanMem, 8},
+                                  PlusWithoutIdentity<int>{});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, std::plus<int>, 0{{.*}}>'}}
+  Q.parallel_for(sycl::range<1>{1024}, ScalarRed1,
+                 [=](sycl::item<1>, auto Reducer) {});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, std::plus<int>, 0{{.*}}>'}}
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed1,
+                 [=](sycl::nd_item<1>, auto Reducer) {});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>'}}
+  Q.parallel_for(sycl::range<1>{1024}, ScalarRed2,
+                 [=](sycl::item<1>, auto Reducer) {});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>'}}
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed2,
+                 [=](sycl::nd_item<1>, auto Reducer) {});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, std::plus<int>, 1{{.*}}>'}}
+  Q.parallel_for(sycl::range<1>{1024}, SpanRed1,
+                 [=](sycl::item<1>, auto Reducer) {});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, std::plus<int>, 1{{.*}}>'}}
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, SpanRed1,
+                 [=](sycl::nd_item<1>, auto Reducer) {});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>'}}
+  Q.parallel_for(sycl::range<1>{1024}, SpanRed2,
+                 [=](sycl::item<1>, auto Reducer) {});
+
+  // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>'}}
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, SpanRed2,
+                 [=](sycl::nd_item<1>, auto Reducer) {});
+
+  Q.parallel_for(sycl::range<1>{1024}, ScalarRed1,
+                 [=](sycl::item<1>, auto &Reducer) {
+                   using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyAssign = Reducer;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveAssign = std::move(Reducer);
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyCtor{Reducer};
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveCtor{std::move(Reducer)};
+                 });
+
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed1,
+                 [=](sycl::nd_item<1>, auto &Reducer) {
+                   using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyAssign = Reducer;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveAssign = std::move(Reducer);
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyCtor{Reducer};
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveCtor{std::move(Reducer)};
+                 });
+
+  Q.parallel_for(sycl::range<1>{1024}, ScalarRed2,
+                 [=](sycl::item<1>, auto &Reducer) {
+                   using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyAssign = Reducer;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveAssign = std::move(Reducer);
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyCtor{Reducer};
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveCtor{std::move(Reducer)};
+                 });
+
+  Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed2,
+                 [=](sycl::nd_item<1>, auto &Reducer) {
+                   using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyAssign = Reducer;
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveAssign = std::move(Reducer);
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerCopyCtor{Reducer};
+                   // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+                   reducer_t ReducerMoveCtor{std::move(Reducer)};
+                 });
+
+  Q.parallel_for(
+      sycl::range<1>{1024}, SpanRed1, [=](sycl::item<1>, auto &Reducer) {
+        using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyAssign = Reducer;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveAssign = std::move(Reducer);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyCtor{Reducer};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveCtor{std::move(Reducer)};
+
+        using reducer_subscript_t =
+            std::remove_reference_t<decltype(Reducer[0])>;
+        reducer_subscript_t ReducerSubscript = Reducer[0];
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveAssign =
+            std::move(ReducerSubscript);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveCtor{
+            std::move(ReducerSubscript)};
+      });
+
+  Q.parallel_for(
+      sycl::nd_range<1>{1024, 1024}, SpanRed1,
+      [=](sycl::nd_item<1>, auto &Reducer) {
+        using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyAssign = Reducer;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveAssign = std::move(Reducer);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyCtor{Reducer};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, std::plus<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveCtor{std::move(Reducer)};
+
+        using reducer_subscript_t =
+            std::remove_reference_t<decltype(Reducer[0])>;
+        reducer_subscript_t ReducerSubscript = Reducer[0];
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveAssign =
+            std::move(ReducerSubscript);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, std::plus<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveCtor{
+            std::move(ReducerSubscript)};
+      });
+
+  Q.parallel_for(
+      sycl::range<1>{1024}, SpanRed2, [=](sycl::item<1>, auto &Reducer) {
+        using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyAssign = Reducer;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveAssign = std::move(Reducer);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyCtor{Reducer};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveCtor{std::move(Reducer)};
+
+        using reducer_subscript_t =
+            std::remove_reference_t<decltype(Reducer[0])>;
+        reducer_subscript_t ReducerSubscript = Reducer[0];
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveAssign =
+            std::move(ReducerSubscript);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveCtor{
+            std::move(ReducerSubscript)};
+      });
+
+  Q.parallel_for(
+      sycl::nd_range<1>{1024, 1024}, SpanRed2,
+      [=](sycl::nd_item<1>, auto &Reducer) {
+        using reducer_t = std::remove_reference_t<decltype(Reducer)>;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyAssign = Reducer;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveAssign = std::move(Reducer);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerCopyCtor{Reducer};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 1{{.*}}>')}}
+        reducer_t ReducerMoveCtor{std::move(Reducer)};
+
+        using reducer_subscript_t =
+            std::remove_reference_t<decltype(Reducer[0])>;
+        reducer_subscript_t ReducerSubscript = Reducer[0];
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript;
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveAssign =
+            std::move(ReducerSubscript);
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript};
+        // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer<int, PlusWithoutIdentity<int>, 0{{.*}}>')}}
+        reducer_subscript_t ReducerSubscriptMoveCtor{
+            std::move(ReducerSubscript)};
+      });
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/relational_builtins.cpp b/sycl/test/basic_tests/relational_builtins.cpp
index 88b97d3980412..33f3ddb671d54 100644
--- a/sycl/test/basic_tests/relational_builtins.cpp
+++ b/sycl/test/basic_tests/relational_builtins.cpp
@@ -280,28 +280,28 @@ void foo() {
 
   // any
   CHECK(int, bool, any, int16_t)
-  CHECK(int, bool, any, int16v)
+  CHECK(int, int, any, int16v)
   CHECK2020(_, bool, any, int16m)
 
   CHECK(int, bool, any, int32_t)
-  CHECK(int, bool, any, int32v)
+  CHECK(int, int, any, int32v)
   CHECK2020(_, bool, any, int32m)
 
   CHECK(int, bool, any, int64_t)
-  CHECK(int, bool, any, int64v)
+  CHECK(int, int, any, int64v)
   CHECK2020(_, bool, any, int64m)
 
   // all
   CHECK(int, bool, all, int16_t)
-  CHECK(int, bool, all, int16v)
+  CHECK(int, int, all, int16v)
   CHECK2020(_, bool, all, int16m)
 
   CHECK(int, bool, all, int32_t)
-  CHECK(int, bool, all, int32v)
+  CHECK(int, int, all, int32v)
   CHECK2020(_, bool, all, int32m)
 
   CHECK(int, bool, all, int64_t)
-  CHECK(int, bool, all, int64v)
+  CHECK(int, int, all, int64v)
   CHECK2020(_, bool, all, int64m)
 
   // bitselect
diff --git a/sycl/test/check_device_code/accessor_index.cpp b/sycl/test/check_device_code/accessor_index.cpp
new file mode 100644
index 0000000000000..04d248e27ccc3
--- /dev/null
+++ b/sycl/test/check_device_code/accessor_index.cpp
@@ -0,0 +1,17 @@
+// RUN: %clangxx -fsycl-device-only -fno-sycl-early-optimizations -S -emit-llvm -D__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ -o - %s | FileCheck %s
+#include <sycl/sycl.hpp>
+
+// Check that accessor index calculation is unrolled in headers.
+// CHECK-NOT: llvm.loop
+// CHECK-NOT: br i1
+using namespace sycl;
+int main() {
+  queue Q;
+  range<3> Range{8, 8, 8};
+  buffer<int, 3> Buf(Range);
+  Q.submit([&](handler &Cgh) {
+    auto Acc = Buf.get_access<access::mode::write>(Cgh);
+    local_accessor<int, 3> LocAcc(Range, Cgh);
+    Cgh.parallel_for(Range, [=](item<3> It) { LocAcc[It] = Acc[It]; });
+  });
+}
diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp
index 7f5b4224c9e92..2ac31f54b058e 100644
--- a/sycl/test/esimd/intrins_trans.cpp
+++ b/sycl/test/esimd/intrins_trans.cpp
@@ -302,4 +302,12 @@ SYCL_EXTERNAL void test_math_intrins() SYCL_ESIMD_FUNCTION {
     // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.ieee.sqrt.v8f32(<8 x float> %{{[a-zA-Z0-9.]+}})
     use(y);
   }
+  {
+    vec<int, 8> x0 = get8i();
+    vec<int, 8> x1 = get8i();
+    vec<int, 8> x2 = get8i();
+    auto res = __esimd_bfn<0xff, int, 8>(x0, x1, x2);
+    // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x i32> @llvm.genx.bfn.v8i32.v8i32(<8 x i32> %{{[a-zA-Z0-9.]+}}, <8 x i32> %{{[a-zA-Z0-9.]+}}, <8 x i32> %{{[a-zA-Z0-9.]+}}, i8 -1)
+    use(res);
+  }
 }
diff --git a/sycl/test/esimd/math_impl.cpp b/sycl/test/esimd/math_impl.cpp
index 44901f7d04195..637462ae6dbbb 100644
--- a/sycl/test/esimd/math_impl.cpp
+++ b/sycl/test/esimd/math_impl.cpp
@@ -10,6 +10,7 @@
 using namespace sycl;
 using namespace sycl::ext::intel;
 using namespace sycl::ext::intel::esimd;
+using namespace sycl::ext::intel::experimental::esimd;
 
 // Math sin,cos,log,exp functions are translated into scalar __spirv_ocl_ calls
 SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> sycl_math(simd<float, 16> x) {
@@ -52,3 +53,13 @@ esimd_math_emu(simd<float, 16> x) {
   v = esimd::exp(v);
   return v;
 }
+
+// Logical BNF function from esimd namespace is translated into __esimd_ calls,
+// which later translate into GenX intrinsics.
+SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<int, 16>
+esimd_bfn(simd<int, 16> x, simd<int, 16> y, simd<int, 16> z) {
+  simd<int, 16> v =
+      experimental::esimd::bfn<~bfn_t::x & ~bfn_t::y & ~bfn_t::z>(x, y, z);
+  //CHECK: call spir_func noundef <16 x i32> @_Z11__esimd_bfn
+  return v;
+}
diff --git a/sycl/test/invoke_simd/return-type-mismatch-error.cpp b/sycl/test/invoke_simd/return-type-mismatch-error.cpp
new file mode 100644
index 0000000000000..2e8907d27f912
--- /dev/null
+++ b/sycl/test/invoke_simd/return-type-mismatch-error.cpp
@@ -0,0 +1,30 @@
+// RUN: not %clangxx -fsycl -fsycl-device-only -S %s -o /dev/null 2>&1 | FileCheck %s
+#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl::ext::oneapi::experimental;
+using namespace sycl;
+namespace esimd = sycl::ext::intel::esimd;
+
+[[intel::device_indirectly_callable]] simd<int, 4> callee(simd<int, 8>) {
+  return simd<int, 4>();
+}
+
+void foo() {
+  constexpr unsigned Size = 1024;
+  constexpr unsigned GroupSize = 64;
+  sycl::range<1> GlobalRange{Size};
+  sycl::range<1> LocalRange{GroupSize};
+  sycl::nd_range<1> Range(GlobalRange, LocalRange);
+  queue q;
+  auto e = q.submit([&](handler &cgh) {
+    cgh.parallel_for(Range, [=](nd_item<1> ndi) {
+      invoke_simd(ndi.get_sub_group(), callee, 0);
+    });
+  });
+}
+
+int main() {
+  foo();
+  // CHECK: {{.*}}error:{{.*}}static assertion failed due to requirement 'RetVecLength == 8': invoke_simd callee return type vector length must match kernel subgroup size{{.*}}
+}
diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py
index b2fffb3c273b5..3ca752c46b182 100644
--- a/sycl/test/lit.cfg.py
+++ b/sycl/test/lit.cfg.py
@@ -96,7 +96,12 @@
 llvm_symbolizer = os.path.join(config.llvm_build_bin_dir, 'llvm-symbolizer')
 llvm_config.with_environment('LLVM_SYMBOLIZER_PATH', llvm_symbolizer)
 
-config.substitutions.append( ('%fsycl-host-only', '-std=c++17 -Xclang -fsycl-is-host -isystem %s -isystem %s -isystem %s -isystem %s' % (config.sycl_include, config.level_zero_include_dir, config.opencl_include_dir, config.sycl_include + '/sycl/') ) )
+sycl_host_only_options = '-std=c++17 -Xclang -fsycl-is-host'
+for include_dir in [config.sycl_include, config.level_zero_include_dir, config.opencl_include_dir, config.sycl_include + '/sycl/']:
+    if include_dir:
+        sycl_host_only_options += ' -isystem %s' % include_dir
+config.substitutions.append( ('%fsycl-host-only', sycl_host_only_options) )
+
 config.substitutions.append( ('%sycl_lib', ' -lsycl6' if platform.system() == "Windows" else '-lsycl') )
 
 llvm_config.add_tool_substitutions(['llvm-spirv'], [config.sycl_tools_dir])
@@ -116,6 +121,12 @@
 if config.esimd_emulator_be == "ON":
     config.available_features.add('esimd_emulator_be')
 
+if config.opencl_be == "ON":
+    config.available_features.add('opencl_be')
+
+if config.level_zero_be == "ON":
+    config.available_features.add('level_zero_be')
+
 if triple == 'nvptx64-nvidia-cuda':
     llvm_config.with_system_environment('CUDA_PATH')
     config.available_features.add('cuda')
diff --git a/sycl/test/lit.site.cfg.py.in b/sycl/test/lit.site.cfg.py.in
index d5a6317e2e23c..0e0f031df2ed4 100644
--- a/sycl/test/lit.site.cfg.py.in
+++ b/sycl/test/lit.site.cfg.py.in
@@ -28,6 +28,8 @@ config.extra_environment = lit_config.params.get("extra_environment", "@LIT_EXTR
 config.cuda_be = '@SYCL_BUILD_PI_CUDA@'
 config.esimd_emulator_be = '@SYCL_BUILD_PI_ESIMD_EMULATOR@'
 config.hip_be = '@SYCL_BUILD_PI_HIP@'
+config.opencl_be = '@SYCL_BUILD_PI_OPENCL@'
+config.level_zero_be = '@SYCL_BUILD_PI_LEVEL_ZERO@'
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
diff --git a/sycl/test/optional_kernel_features/relaxed_fp64_propagation.cpp b/sycl/test/optional_kernel_features/relaxed_fp64_propagation.cpp
new file mode 100644
index 0000000000000..b354702ca47de
--- /dev/null
+++ b/sycl/test/optional_kernel_features/relaxed_fp64_propagation.cpp
@@ -0,0 +1,24 @@
+// RUN: %clangxx %s -S -o %t_opt.ll -fsycl-device-only -Xclang -verify -Xclang -verify-ignore-unexpected=note
+// RUN: FileCheck %s --input-file %t_opt.ll --check-prefix=CHECK-OPT
+// RUN: %clangxx %s -S -fno-sycl-early-optimizations -o %t_noopt.ll -fsycl-device-only -Xclang -verify -Xclang -verify-ignore-unexpected=note
+// RUN: FileCheck %s --input-file %t_noopt.ll --check-prefix=CHECK-NOOPT
+
+// Tests that an optimization that removes the use of double still produces a
+// warning.
+
+// CHECK-OPT-NOT: double
+// CHECK-NOOPT: double
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+  // expected-warning-re@+1 {{function '{{.*}}' uses aspect 'fp64' not listed in its 'sycl::device_has' attribute}}
+  Q.single_task([=]() [[sycl::device_has()]] {
+    // Double will be optimized out as LoweredFloat can be set directly to a
+    // lowered value.
+    double Double = 3.14;
+    volatile float LoweredFloat = Double;
+  });
+  return 0;
+}
diff --git a/sycl/test/regression/vec_init_list_ctor.cpp b/sycl/test/regression/vec_init_list_ctor.cpp
new file mode 100644
index 0000000000000..740075bc0fe80
--- /dev/null
+++ b/sycl/test/regression/vec_init_list_ctor.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang -fsycl -fsyntax-only -Xclang -verify %s
+// expected-no-diagnostics
+
+// Regression test checking that the vector ctor taking an initializer list
+// doesn't cause warnings or errors.
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::vec<int, 2> V({1, 2});
+  return 0;
+}
diff --git a/sycl/unittests/Extensions/DeviceGlobal.cpp b/sycl/unittests/Extensions/DeviceGlobal.cpp
index 4d47500507502..dc34b822a1013 100644
--- a/sycl/unittests/Extensions/DeviceGlobal.cpp
+++ b/sycl/unittests/Extensions/DeviceGlobal.cpp
@@ -12,6 +12,7 @@
 #include "detail/context_impl.hpp"
 #include "detail/kernel_program_cache.hpp"
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -42,31 +43,17 @@ sycl::ext::oneapi::experimental::device_global<
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<DeviceGlobalTestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<DeviceGlobalTestKernel>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return DeviceGlobalTestKernelName; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
-template <> struct KernelInfo<DeviceGlobalImgScopeTestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<DeviceGlobalImgScopeTestKernel>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return DeviceGlobalImgScopeTestKernelName;
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
diff --git a/sycl/unittests/Extensions/USMMemcpy2D.cpp b/sycl/unittests/Extensions/USMMemcpy2D.cpp
index 8139f6ae24edd..ffdaa86ff95a6 100644
--- a/sycl/unittests/Extensions/USMMemcpy2D.cpp
+++ b/sycl/unittests/Extensions/USMMemcpy2D.cpp
@@ -10,6 +10,7 @@
 
 #include <detail/queue_impl.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -23,7 +24,9 @@ constexpr const char *USMMemcpyHelperKernelNameChar = "__usmmemcpy2d_char";
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<class __usmfill2d<long>> {
+template <>
+struct KernelInfo<class __usmfill2d<long>>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return USMFillHelperKernelNameLong; }
   static constexpr unsigned getNumParams() { return 7; }
   static const kernel_param_desc_t &getParamDesc(int Idx) {
@@ -39,15 +42,14 @@ template <> struct KernelInfo<class __usmfill2d<long>> {
     };
     return DummySignature[Idx];
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
   static constexpr int64_t getKernelSize() {
     return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 3 * sizeof(size_t);
   }
 };
 
-template <> struct KernelInfo<class __usmfill2d<unsigned char>> {
+template <>
+struct KernelInfo<class __usmfill2d<unsigned char>>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return USMFillHelperKernelNameChar; }
   static constexpr unsigned getNumParams() { return 7; }
   static const kernel_param_desc_t &getParamDesc(int Idx) {
@@ -63,15 +65,14 @@ template <> struct KernelInfo<class __usmfill2d<unsigned char>> {
     };
     return DummySignature[Idx];
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
   static constexpr int64_t getKernelSize() {
     return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 3 * sizeof(size_t);
   }
 };
 
-template <> struct KernelInfo<class __usmmemcpy2d<long>> {
+template <>
+struct KernelInfo<class __usmmemcpy2d<long>>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return USMMemcpyHelperKernelNameLong;
   }
@@ -90,15 +91,14 @@ template <> struct KernelInfo<class __usmmemcpy2d<long>> {
     };
     return DummySignature[Idx];
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
   static constexpr int64_t getKernelSize() {
     return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 4 * sizeof(size_t);
   }
 };
 
-template <> struct KernelInfo<class __usmmemcpy2d<unsigned char>> {
+template <>
+struct KernelInfo<class __usmmemcpy2d<unsigned char>>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return USMMemcpyHelperKernelNameChar;
   }
@@ -117,9 +117,6 @@ template <> struct KernelInfo<class __usmmemcpy2d<unsigned char>> {
     };
     return DummySignature[Idx];
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
   static constexpr int64_t getKernelSize() {
     return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 4 * sizeof(size_t);
   }
diff --git a/sycl/unittests/Extensions/WeakObject.cpp b/sycl/unittests/Extensions/WeakObject.cpp
index 2357ab5febf8a..8af9974bdba4a 100644
--- a/sycl/unittests/Extensions/WeakObject.cpp
+++ b/sycl/unittests/Extensions/WeakObject.cpp
@@ -198,6 +198,39 @@ template <typename SyclObjT> struct WeakObjectCheckOwnerLessMap {
   }
 };
 
+template <typename SyclObjT> struct WeakObjectCheckCopy {
+  void operator()(SyclObjT Obj) {
+    sycl::ext::oneapi::weak_object<SyclObjT> WeakObj{Obj};
+
+    sycl::ext::oneapi::weak_object<SyclObjT> WeakObjCopyCtor{WeakObj};
+    sycl::ext::oneapi::weak_object<SyclObjT> WeakObjCopyAssign = WeakObj;
+
+    EXPECT_FALSE(WeakObjCopyCtor.expired());
+    EXPECT_FALSE(WeakObjCopyAssign.expired());
+
+    EXPECT_TRUE(WeakObjCopyCtor.lock() == Obj);
+    EXPECT_TRUE(WeakObjCopyAssign.lock() == Obj);
+  }
+};
+
+template <typename SyclObjT> struct WeakObjectCheckMove {
+  void operator()(SyclObjT Obj) {
+    sycl::ext::oneapi::weak_object<SyclObjT> WeakObj1{Obj};
+    sycl::ext::oneapi::weak_object<SyclObjT> WeakObj2{Obj};
+
+    sycl::ext::oneapi::weak_object<SyclObjT> WeakObjMoveCtor{
+        std::move(WeakObj1)};
+    sycl::ext::oneapi::weak_object<SyclObjT> WeakObjMoveAssign =
+        std::move(WeakObj2);
+
+    EXPECT_FALSE(WeakObjMoveCtor.expired());
+    EXPECT_FALSE(WeakObjMoveAssign.expired());
+
+    EXPECT_TRUE(WeakObjMoveCtor.lock() == Obj);
+    EXPECT_TRUE(WeakObjMoveAssign.lock() == Obj);
+  }
+};
+
 template <template <typename> typename CallableT>
 void runTest(sycl::unittest::PiMock &Mock) {
   sycl::platform Plt = Mock.getPlatform();
@@ -385,3 +418,13 @@ TEST(WeakObjectTest, WeakObjectOwnerLessMap) {
   sycl::unittest::PiMock Mock;
   runTestMulti<WeakObjectCheckOwnerLessMap>(Mock);
 }
+
+TEST(WeakObjectTest, WeakObjectCopy) {
+  sycl::unittest::PiMock Mock;
+  runTest<WeakObjectCheckCopy>(Mock);
+}
+
+TEST(WeakObjectTest, WeakObjectMove) {
+  sycl::unittest::PiMock Mock;
+  runTest<WeakObjectCheckMove>(Mock);
+}
diff --git a/sycl/unittests/SYCL2020/AtomicMemoryOrderCapabilities.cpp b/sycl/unittests/SYCL2020/AtomicMemoryOrderCapabilities.cpp
new file mode 100644
index 0000000000000..6ed291c0da3e8
--- /dev/null
+++ b/sycl/unittests/SYCL2020/AtomicMemoryOrderCapabilities.cpp
@@ -0,0 +1,65 @@
+//==---- AtomicMemoryOrderCapabilities.cpp --- memory order query test -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <helpers/PiMock.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+namespace {
+
+static constexpr size_t expectedCapabilityVecSize = 5;
+static thread_local bool deviceGetInfoCalled = false;
+
+static bool has_capability(const std::vector<memory_order> &deviceCapabilities,
+                           memory_order capabilityToFind) {
+  return std::find(deviceCapabilities.begin(), deviceCapabilities.end(),
+                   capabilityToFind) != deviceCapabilities.end();
+}
+
+pi_result redefinedDeviceGetInfo(pi_device device, pi_device_info param_name,
+                                 size_t param_value_size, void *param_value,
+                                 size_t *param_value_size_ret) {
+  if (param_name == PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES) {
+    deviceGetInfoCalled = true;
+    if (param_value) {
+      pi_memory_order_capabilities *Capabilities =
+          reinterpret_cast<pi_memory_order_capabilities *>(param_value);
+      *Capabilities = PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
+                      PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL |
+                      PI_MEMORY_ORDER_SEQ_CST;
+    }
+  }
+  return PI_SUCCESS;
+}
+
+TEST(AtomicMemoryOrderCapabilities, DeviceQueryReturnsCorrectCapabilities) {
+  unittest::PiMock Mock;
+  platform Plt = Mock.getPlatform();
+
+  Mock.redefineAfter<detail::PiApiKind::piDeviceGetInfo>(
+      redefinedDeviceGetInfo);
+
+  const device Dev = Plt.get_devices()[0];
+  context Ctx{Dev};
+
+  auto Capabilities =
+      Dev.get_info<info::device::atomic_memory_order_capabilities>();
+  EXPECT_TRUE(deviceGetInfoCalled);
+  EXPECT_EQ(Capabilities.size(), expectedCapabilityVecSize);
+
+  EXPECT_TRUE(has_capability(Capabilities, memory_order::relaxed));
+  EXPECT_TRUE(has_capability(Capabilities, memory_order::acquire));
+  EXPECT_TRUE(has_capability(Capabilities, memory_order::release));
+  EXPECT_TRUE(has_capability(Capabilities, memory_order::acq_rel));
+  EXPECT_TRUE(has_capability(Capabilities, memory_order::seq_cst));
+}
+
+} // namespace
diff --git a/sycl/unittests/SYCL2020/AtomicMemoryScopeCapabilities.cpp b/sycl/unittests/SYCL2020/AtomicMemoryScopeCapabilities.cpp
new file mode 100644
index 0000000000000..776991b9e53bf
--- /dev/null
+++ b/sycl/unittests/SYCL2020/AtomicMemoryScopeCapabilities.cpp
@@ -0,0 +1,68 @@
+//==-------- AtomicMemoryScopeCapabilities.cpp --- queue unit tests --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <gtest/gtest.h>
+#include <helpers/PiMock.hpp>
+
+using namespace sycl;
+
+namespace {
+
+thread_local bool deviceGetInfoCalled;
+
+pi_result redefinedDeviceGetInfoAfter(pi_device device,
+                                      pi_device_info param_name,
+                                      size_t param_value_size,
+                                      void *param_value,
+                                      size_t *param_value_size_ret) {
+  if (param_name == PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES) {
+    deviceGetInfoCalled = true;
+    if (param_value) {
+      auto *Result =
+          reinterpret_cast<pi_memory_scope_capabilities *>(param_value);
+      *Result = PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
+                PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE |
+                PI_MEMORY_SCOPE_SYSTEM;
+    }
+  }
+  return PI_SUCCESS;
+}
+
+TEST(AtomicMemoryScopeCapabilitiesCheck, CheckAtomicMemoryScopeCapabilities) {
+  sycl::unittest::PiMock Mock;
+  sycl::platform Plt = Mock.getPlatform();
+  device Dev = Plt.get_devices()[0];
+
+  deviceGetInfoCalled = false;
+
+  Mock.redefineAfter<detail::PiApiKind::piDeviceGetInfo>(
+      redefinedDeviceGetInfoAfter);
+  auto scope_capabilities =
+      Dev.get_info<sycl::info::device::atomic_memory_scope_capabilities>();
+  EXPECT_TRUE(deviceGetInfoCalled);
+  size_t expectedSize = 5;
+  EXPECT_EQ(scope_capabilities.size(), expectedSize);
+
+  auto res = std::find(scope_capabilities.begin(), scope_capabilities.end(),
+                       sycl::memory_scope::work_item);
+  EXPECT_FALSE(res == scope_capabilities.end());
+  res = std::find(scope_capabilities.begin(), scope_capabilities.end(),
+                  sycl::memory_scope::sub_group);
+  EXPECT_FALSE(res == scope_capabilities.end());
+  res = std::find(scope_capabilities.begin(), scope_capabilities.end(),
+                  sycl::memory_scope::work_group);
+  EXPECT_FALSE(res == scope_capabilities.end());
+  res = std::find(scope_capabilities.begin(), scope_capabilities.end(),
+                  sycl::memory_scope::device);
+  EXPECT_FALSE(res == scope_capabilities.end());
+  res = std::find(scope_capabilities.begin(), scope_capabilities.end(),
+                  sycl::memory_scope::system);
+  EXPECT_FALSE(res == scope_capabilities.end());
+}
+} // anonymous namespace
diff --git a/sycl/unittests/SYCL2020/CMakeLists.txt b/sycl/unittests/SYCL2020/CMakeLists.txt
index 9e22f73abfa00..4e30050ffd088 100644
--- a/sycl/unittests/SYCL2020/CMakeLists.txt
+++ b/sycl/unittests/SYCL2020/CMakeLists.txt
@@ -4,10 +4,13 @@ add_sycl_unittest(SYCL2020Tests OBJECT
   GetNativeOpenCL.cpp
   SpecializationConstant.cpp
   KernelBundle.cpp
+  KernelBundleStateFiltering.cpp
   KernelID.cpp
   HasExtension.cpp
   IsCompatible.cpp
   DeviceGetInfoAspects.cpp
   DeviceAspectTraits.cpp
+  AtomicMemoryOrderCapabilities.cpp
+  AtomicMemoryScopeCapabilities.cpp
 )
 
diff --git a/sycl/unittests/SYCL2020/IsCompatible.cpp b/sycl/unittests/SYCL2020/IsCompatible.cpp
index 19a4d738af4f5..14214e0101b13 100644
--- a/sycl/unittests/SYCL2020/IsCompatible.cpp
+++ b/sycl/unittests/SYCL2020/IsCompatible.cpp
@@ -1,5 +1,6 @@
 #include <sycl/sycl.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -16,141 +17,51 @@ class TestKernelACC;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<TestKernelCPU> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernelCPU> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernelCPU"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-} // namespace detail
-} // __SYCL_INLINE_VER_NAMESPACE(_V1)
-} // namespace sycl
-
-namespace sycl {
-__SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace detail {
-template <> struct KernelInfo<TestKernelCPUInvalidReqdWGSize1D> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
+template <>
+struct KernelInfo<TestKernelCPUInvalidReqdWGSize1D>
+    : public unittest::MockKernelInfoBase {
+  static constexpr const char *getName() {
+    return "TestKernelCPUInvalidReqdWGSize1D";
   }
-  static constexpr const char *getName() { return "TestKernelCPUInvalidReqdWGSize1D"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-} // namespace detail
-} // __SYCL_INLINE_VER_NAMESPACE(_V1)
-} // namespace sycl
-
-namespace sycl {
-__SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace detail {
-template <> struct KernelInfo<TestKernelCPUInvalidReqdWGSize2D> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
+template <>
+struct KernelInfo<TestKernelCPUInvalidReqdWGSize2D>
+    : public unittest::MockKernelInfoBase {
+  static constexpr const char *getName() {
+    return "TestKernelCPUInvalidReqdWGSize2D";
   }
-  static constexpr const char *getName() { return "TestKernelCPUInvalidReqdWGSize2D"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-} // namespace detail
-} // __SYCL_INLINE_VER_NAMESPACE(_V1)
-} // namespace sycl
-
-namespace sycl {
-__SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace detail {
-template <> struct KernelInfo<TestKernelCPUInvalidReqdWGSize3D> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernelCPUInvalidReqdWGSize3D>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return "TestKernelCPUInvalidReqdWGSize3D";
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-} // namespace detail
-} // __SYCL_INLINE_VER_NAMESPACE(_V1)
-} // namespace sycl
-
-namespace sycl {
-__SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace detail {
-template <> struct KernelInfo<TestKernelCPUValidReqdWGSize3D> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernelCPUValidReqdWGSize3D>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return "TestKernelCPUValidReqdWGSize3D";
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-} // namespace detail
-} // __SYCL_INLINE_VER_NAMESPACE(_V1)
-} // namespace sycl
-
-namespace sycl {
-__SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace detail {
-template <> struct KernelInfo<TestKernelGPU> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernelGPU> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernelGPU"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-} // namespace detail
-} // __SYCL_INLINE_VER_NAMESPACE(_V1)
-} // namespace sycl
-
-namespace sycl {
-__SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace detail {
-template <> struct KernelInfo<TestKernelACC> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernelACC> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernelACC"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 } // namespace detail
diff --git a/sycl/unittests/SYCL2020/KernelBundle.cpp b/sycl/unittests/SYCL2020/KernelBundle.cpp
index 28755557ba5ae..6cf41ac0924bb 100644
--- a/sycl/unittests/SYCL2020/KernelBundle.cpp
+++ b/sycl/unittests/SYCL2020/KernelBundle.cpp
@@ -10,6 +10,7 @@
 #include <detail/kernel_bundle_impl.hpp>
 #include <sycl/sycl.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -22,43 +23,19 @@ class TestKernelWithAspects;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<TestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernel> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernel"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<TestKernelExeOnly> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernelExeOnly> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernelExeOnly"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<TestKernelWithAspects> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernelWithAspects> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernelWithAspects"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 } // namespace detail
diff --git a/sycl/unittests/SYCL2020/KernelBundleStateFiltering.cpp b/sycl/unittests/SYCL2020/KernelBundleStateFiltering.cpp
new file mode 100644
index 0000000000000..fb4e7f3315be9
--- /dev/null
+++ b/sycl/unittests/SYCL2020/KernelBundleStateFiltering.cpp
@@ -0,0 +1,213 @@
+//==---- KernelBundleStateFiltering.cpp --- Kernel bundle unit test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <detail/device_impl.hpp>
+#include <detail/kernel_bundle_impl.hpp>
+#include <sycl/sycl.hpp>
+
+#include <helpers/MockKernelInfo.hpp>
+#include <helpers/PiImage.hpp>
+#include <helpers/PiMock.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <set>
+#include <vector>
+
+class KernelA;
+class KernelB;
+class KernelC;
+class KernelD;
+class KernelE;
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+template <> struct KernelInfo<KernelA> : public unittest::MockKernelInfoBase {
+  static constexpr const char *getName() { return "KernelA"; }
+};
+template <> struct KernelInfo<KernelB> : public unittest::MockKernelInfoBase {
+  static constexpr const char *getName() { return "KernelB"; }
+};
+template <> struct KernelInfo<KernelC> : public unittest::MockKernelInfoBase {
+  static constexpr const char *getName() { return "KernelC"; }
+};
+template <> struct KernelInfo<KernelD> : public unittest::MockKernelInfoBase {
+  static constexpr const char *getName() { return "KernelD"; }
+};
+template <> struct KernelInfo<KernelE> : public unittest::MockKernelInfoBase {
+  static constexpr const char *getName() { return "KernelE"; }
+};
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
+
+namespace {
+
+std::set<const void *> TrackedImages;
+sycl::unittest::PiImage
+generateDefaultImage(std::initializer_list<std::string> KernelNames,
+                     pi_device_binary_type BinaryType,
+                     const char *DeviceTargetSpec) {
+  using namespace sycl::unittest;
+
+  PiPropertySet PropSet;
+
+  static unsigned char NImage = 0;
+  std::vector<unsigned char> Bin{NImage++};
+
+  PiArray<PiOffloadEntry> Entries = makeEmptyKernels(KernelNames);
+
+  PiImage Img{BinaryType, // Format
+              DeviceTargetSpec,
+              "", // Compile options
+              "", // Link options
+              std::move(Bin),
+              std::move(Entries),
+              std::move(PropSet)};
+  const void *BinaryPtr = Img.getBinaryPtr();
+  TrackedImages.insert(BinaryPtr);
+
+  return Img;
+}
+
+// Image 0: input, KernelA KernelB
+// Image 1: exe, KernelA
+// Image 2: input, KernelC
+// Image 3: exe, KernelC
+// Image 4: input, KernelD
+// Image 5: input, KernelE
+// Image 6: exe, KernelE
+// Image 7: exe. KernelE
+sycl::unittest::PiImage Imgs[] = {
+    generateDefaultImage({"KernelA", "KernelB"}, PI_DEVICE_BINARY_TYPE_SPIRV,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64),
+    generateDefaultImage({"KernelA"}, PI_DEVICE_BINARY_TYPE_NATIVE,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64),
+    generateDefaultImage({"KernelC"}, PI_DEVICE_BINARY_TYPE_SPIRV,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64),
+    generateDefaultImage({"KernelC"}, PI_DEVICE_BINARY_TYPE_NATIVE,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64),
+    generateDefaultImage({"KernelD"}, PI_DEVICE_BINARY_TYPE_SPIRV,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64),
+    generateDefaultImage({"KernelE"}, PI_DEVICE_BINARY_TYPE_SPIRV,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64),
+    generateDefaultImage({"KernelE"}, PI_DEVICE_BINARY_TYPE_NATIVE,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64),
+    generateDefaultImage({"KernelE"}, PI_DEVICE_BINARY_TYPE_NATIVE,
+                         __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64)};
+
+sycl::unittest::PiImageArray<std::size(Imgs)> ImgArray{Imgs};
+std::vector<unsigned char> UsedImageIndices;
+
+void redefinedPiProgramCreateCommon(const void *bin) {
+  if (TrackedImages.count(bin) != 0) {
+    unsigned char ImgIdx = *reinterpret_cast<const unsigned char *>(bin);
+    UsedImageIndices.push_back(ImgIdx);
+  }
+}
+
+pi_result redefinedPiProgramCreate(pi_context context, const void *il,
+                                   size_t length, pi_program *res_program) {
+  redefinedPiProgramCreateCommon(il);
+  return PI_SUCCESS;
+}
+
+pi_result redefinedPiProgramCreateWithBinary(
+    pi_context context, pi_uint32 num_devices, const pi_device *device_list,
+    const size_t *lengths, const unsigned char **binaries,
+    size_t num_metadata_entries, const pi_device_binary_property *metadata,
+    pi_int32 *binary_status, pi_program *ret_program) {
+  redefinedPiProgramCreateCommon(binaries[0]);
+  return PI_SUCCESS;
+}
+
+pi_result redefinedDevicesGet(pi_platform platform, pi_device_type device_type,
+                              pi_uint32 num_entries, pi_device *devices,
+                              pi_uint32 *num_devices) {
+  if (num_devices)
+    *num_devices = 2;
+
+  if (devices) {
+    devices[0] = reinterpret_cast<pi_device>(1);
+    devices[1] = reinterpret_cast<pi_device>(2);
+  }
+
+  return PI_SUCCESS;
+}
+
+pi_result redefinedExtDeviceSelectBinary(pi_device device,
+                                         pi_device_binary *binaries,
+                                         pi_uint32 num_binaries,
+                                         pi_uint32 *selected_binary_ind) {
+  EXPECT_EQ(num_binaries, 1U);
+  // Treat image 3 as incompatible with one of the devices.
+  if (TrackedImages.count(binaries[0]->BinaryStart) != 0 &&
+      *binaries[0]->BinaryStart == 3 &&
+      device == reinterpret_cast<pi_device>(2)) {
+    return PI_ERROR_INVALID_BINARY;
+  }
+  *selected_binary_ind = 0;
+  return PI_SUCCESS;
+}
+
+void verifyImageUse(const std::vector<unsigned char> &ExpectedImages) {
+  std::sort(UsedImageIndices.begin(), UsedImageIndices.end());
+  EXPECT_TRUE(std::is_sorted(ExpectedImages.begin(), ExpectedImages.end()));
+  EXPECT_EQ(UsedImageIndices, ExpectedImages);
+  UsedImageIndices.clear();
+}
+
+TEST(KernelBundle, DeviceImageStateFiltering) {
+  sycl::unittest::PiMock Mock;
+  Mock.redefineAfter<sycl::detail::PiApiKind::piProgramCreate>(
+      redefinedPiProgramCreate);
+  Mock.redefineAfter<sycl::detail::PiApiKind::piProgramCreateWithBinary>(
+      redefinedPiProgramCreateWithBinary);
+
+  // No kernel ids specified.
+  {
+    const sycl::device Dev = Mock.getPlatform().get_devices()[0];
+    sycl::context Ctx{Dev};
+
+    sycl::kernel_bundle<sycl::bundle_state::executable> KernelBundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::executable>(Ctx, {Dev});
+    verifyImageUse({0, 1, 3, 4, 6, 7});
+  }
+
+  sycl::kernel_id KernelAID = sycl::get_kernel_id<KernelA>();
+  sycl::kernel_id KernelCID = sycl::get_kernel_id<KernelC>();
+  sycl::kernel_id KernelDID = sycl::get_kernel_id<KernelD>();
+
+  // Request specific kernel ids.
+  {
+    const sycl::device Dev = Mock.getPlatform().get_devices()[0];
+    sycl::context Ctx{Dev};
+
+    sycl::kernel_bundle<sycl::bundle_state::executable> KernelBundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Ctx, {Dev}, {KernelAID, KernelCID, KernelDID});
+    verifyImageUse({1, 3, 4});
+  }
+
+  // Check the case where some executable images are unsupported by one of
+  // the devices.
+  {
+    Mock.redefine<sycl::detail::PiApiKind::piDevicesGet>(redefinedDevicesGet);
+    Mock.redefine<sycl::detail::PiApiKind::piextDeviceSelectBinary>(
+        redefinedExtDeviceSelectBinary);
+    const std::vector<sycl::device> Devs = Mock.getPlatform().get_devices();
+    sycl::context Ctx{Devs};
+
+    sycl::kernel_bundle<sycl::bundle_state::executable> KernelBundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Ctx, Devs, {KernelAID, KernelCID, KernelDID});
+    verifyImageUse({1, 2, 3, 4});
+  }
+}
+} // namespace
diff --git a/sycl/unittests/SYCL2020/KernelID.cpp b/sycl/unittests/SYCL2020/KernelID.cpp
index 3d7ad4a7f107a..75008fdf82e23 100644
--- a/sycl/unittests/SYCL2020/KernelID.cpp
+++ b/sycl/unittests/SYCL2020/KernelID.cpp
@@ -8,6 +8,7 @@
 
 #include <sycl/sycl.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -21,58 +22,26 @@ class ServiceKernel1;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<TestKernel1> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernel1> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "KernelID_TestKernel1"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<TestKernel2> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernel2> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "KernelID_TestKernel2"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<TestKernel3> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernel3> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "KernelID_TestKernel3"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<ServiceKernel1> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<ServiceKernel1> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return "_ZTSN2cl4sycl6detail23__sycl_service_kernel__14ServiceKernel1";
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
diff --git a/sycl/unittests/SYCL2020/SpecializationConstant.cpp b/sycl/unittests/SYCL2020/SpecializationConstant.cpp
index 8588ae7f9b718..4a2b65435678b 100644
--- a/sycl/unittests/SYCL2020/SpecializationConstant.cpp
+++ b/sycl/unittests/SYCL2020/SpecializationConstant.cpp
@@ -11,6 +11,7 @@
 #include <detail/device_image_impl.hpp>
 #include <sycl/sycl.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -22,19 +23,11 @@ const static sycl::specialization_id<int> SpecConst1{42};
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<TestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernel> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return "SpecializationConstant_TestKernel";
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 template <> const char *get_spec_constant_symbolic_ID<SpecConst1>() {
diff --git a/sycl/unittests/assert/assert.cpp b/sycl/unittests/assert/assert.cpp
index 6bdda2e7a144c..3a9482193fbba 100644
--- a/sycl/unittests/assert/assert.cpp
+++ b/sycl/unittests/assert/assert.cpp
@@ -27,6 +27,7 @@
 #include <detail/context_impl.hpp>
 #include <detail/device_impl.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -42,24 +43,17 @@ class TestKernel;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<TestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<TestKernel> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernel"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 static constexpr const kernel_param_desc_t Signatures[] = {
     {kernel_param_kind_t::kind_accessor, 4062, 0}};
 
 template <>
-struct KernelInfo<::sycl::detail::__sycl_service_kernel__::AssertInfoCopier> {
+struct KernelInfo<::sycl::detail::__sycl_service_kernel__::AssertInfoCopier>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return "_ZTSN2cl4sycl6detail23__sycl_service_kernel__16AssertInfoCopierE";
   }
@@ -68,9 +62,6 @@ struct KernelInfo<::sycl::detail::__sycl_service_kernel__::AssertInfoCopier> {
     assert(!Idx);
     return Signatures[Idx];
   }
-  static constexpr bool isESIMD() { return 0; }
-  static constexpr bool callsThisItem() { return 0; }
-  static constexpr bool callsAnyThisFreeFunction() { return 0; }
   static constexpr int64_t getKernelSize() {
     // The AssertInfoCopier service kernel lambda captures an accessor.
     return sizeof(sycl::accessor<sycl::detail::AssertHappened, 1,
diff --git a/sycl/unittests/helpers/MockKernelInfo.hpp b/sycl/unittests/helpers/MockKernelInfo.hpp
new file mode 100644
index 0000000000000..af17642b74dd7
--- /dev/null
+++ b/sycl/unittests/helpers/MockKernelInfo.hpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <sycl/detail/kernel_desc.hpp>
+
+#include <cstdint>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace unittest {
+struct MockKernelInfoBase {
+  static constexpr unsigned getNumParams() { return 0; }
+  static const detail::kernel_param_desc_t &getParamDesc(int) {
+    static detail::kernel_param_desc_t Dummy;
+    return Dummy;
+  }
+  static constexpr bool isESIMD() { return false; }
+  static constexpr bool callsThisItem() { return false; }
+  static constexpr bool callsAnyThisFreeFunction() { return false; }
+  static constexpr int64_t getKernelSize() { return 1; }
+};
+
+} // namespace unittest
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/unittests/helpers/PiImage.hpp b/sycl/unittests/helpers/PiImage.hpp
index 38b06eef6d242..5a536c96a569c 100644
--- a/sycl/unittests/helpers/PiImage.hpp
+++ b/sycl/unittests/helpers/PiImage.hpp
@@ -262,6 +262,7 @@ class PiImage {
         MPropertySet.end(),
     };
   }
+  const unsigned char *getBinaryPtr() { return &*MBinary.begin(); }
 
 private:
   uint16_t MVersion;
diff --git a/sycl/unittests/helpers/TestKernel.hpp b/sycl/unittests/helpers/TestKernel.hpp
index 3db52b419af37..abeeeaa53e9b5 100644
--- a/sycl/unittests/helpers/TestKernel.hpp
+++ b/sycl/unittests/helpers/TestKernel.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "MockKernelInfo.hpp"
 #include "PiImage.hpp"
 
 template <size_t KernelSize = 1> class TestKernel;
@@ -15,16 +16,10 @@ template <size_t KernelSize = 1> class TestKernel;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <size_t KernelSize> struct KernelInfo<TestKernel<KernelSize>> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <size_t KernelSize>
+struct KernelInfo<TestKernel<KernelSize>>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "TestKernel"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
   static constexpr int64_t getKernelSize() { return KernelSize; }
   static constexpr const char *getFileName() { return "TestKernel.hpp"; }
   static constexpr const char *getFunctionName() {
diff --git a/sycl/unittests/kernel-and-program/Cache.cpp b/sycl/unittests/kernel-and-program/Cache.cpp
index a4cc58ea360ae..0bee628686087 100644
--- a/sycl/unittests/kernel-and-program/Cache.cpp
+++ b/sycl/unittests/kernel-and-program/Cache.cpp
@@ -15,6 +15,7 @@
 #include "detail/kernel_program_cache.hpp"
 #include "detail/program_impl.hpp"
 #include "sycl/detail/pi.h"
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 #include <sycl/sycl.hpp>
@@ -40,22 +41,12 @@ namespace sycl {
 const static specialization_id<int> SpecConst1{42};
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-struct MockKernelInfo {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
-};
-
-template <> struct KernelInfo<CacheTestKernel> : public MockKernelInfo {
+template <>
+struct KernelInfo<CacheTestKernel> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "CacheTestKernel"; }
 };
-template <> struct KernelInfo<CacheTestKernel2> : public MockKernelInfo {
+template <>
+struct KernelInfo<CacheTestKernel2> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "CacheTestKernel2"; }
 };
 template <> const char *get_spec_constant_symbolic_ID<SpecConst1>() {
@@ -68,7 +59,11 @@ template <> const char *get_spec_constant_symbolic_ID<SpecConst1>() {
 static sycl::unittest::PiImage generateDefaultImage() {
   using namespace sycl::unittest;
 
+  std::vector<char> SpecConstData;
+  PiProperty SC1 = makeSpecConstant<int>(SpecConstData, "SC1", {0}, {0}, {42});
+
   PiPropertySet PropSet;
+  addSpecConstants({SC1}, std::move(SpecConstData), PropSet);
 
   std::vector<unsigned char> Bin{0, 1, 2, 3, 4, 5}; // Random data
 
@@ -256,7 +251,7 @@ TEST_F(KernelAndProgramCacheTest, SpecConstantCacheNegative) {
   detail::KernelProgramCache::ProgramCache &Cache =
       CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
 
-  EXPECT_EQ(Cache.size(), 1U) << "Expect non-empty cache";
+  EXPECT_EQ(Cache.size(), 2U) << "Expect an entry for each build in the cache.";
 }
 
 // Check that kernel_bundle created through join() is not cached.
diff --git a/sycl/unittests/kernel-and-program/KernelBuildOptions.cpp b/sycl/unittests/kernel-and-program/KernelBuildOptions.cpp
index f67ffed3f81f1..5ad6a31c3a028 100644
--- a/sycl/unittests/kernel-and-program/KernelBuildOptions.cpp
+++ b/sycl/unittests/kernel-and-program/KernelBuildOptions.cpp
@@ -11,6 +11,7 @@
 #define __SYCL_INTERNAL_API
 #endif
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 #include <sycl/sycl.hpp>
@@ -23,17 +24,10 @@ static std::string BuildOpts;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<BuildOptsTestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<BuildOptsTestKernel> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "BuildOptsTestKernel"; }
   static constexpr bool isESIMD() { return true; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 } // namespace detail
diff --git a/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp b/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp
index 5f29139f336e0..650328f4a8f09 100644
--- a/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp
+++ b/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp
@@ -11,6 +11,7 @@
 #include "detail/context_impl.hpp"
 #include "detail/kernel_bundle_impl.hpp"
 #include "detail/kernel_program_cache.hpp"
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -25,19 +26,12 @@ class MultipleDevsCacheTestKernel;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<MultipleDevsCacheTestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<MultipleDevsCacheTestKernel>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return "MultipleDevsCacheTestKernel";
   }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 } // namespace detail
diff --git a/sycl/unittests/program_manager/EliminatedArgMask.cpp b/sycl/unittests/program_manager/EliminatedArgMask.cpp
index d71902b5f2a40..e5e7f0e0cbee0 100644
--- a/sycl/unittests/program_manager/EliminatedArgMask.cpp
+++ b/sycl/unittests/program_manager/EliminatedArgMask.cpp
@@ -12,6 +12,7 @@
 #include <detail/scheduler/commands.hpp>
 #include <sycl/sycl.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -26,30 +27,16 @@ constexpr unsigned EAMTestKernelNumArgs = 4;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<EAMTestKernel> {
+template <>
+struct KernelInfo<EAMTestKernel> : public unittest::MockKernelInfoBase {
   static constexpr unsigned getNumParams() { return EAMTestKernelNumArgs; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
   static constexpr const char *getName() { return EAMTestKernelName; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<EAMTestKernel2> {
+template <>
+struct KernelInfo<EAMTestKernel2> : public unittest::MockKernelInfoBase {
   static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
   static constexpr const char *getName() { return EAMTestKernel2Name; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 } // namespace detail
diff --git a/sycl/unittests/program_manager/passing_link_and_compile_options.cpp b/sycl/unittests/program_manager/passing_link_and_compile_options.cpp
index 95f2cf4ba5ca3..a1f36996ef019 100644
--- a/sycl/unittests/program_manager/passing_link_and_compile_options.cpp
+++ b/sycl/unittests/program_manager/passing_link_and_compile_options.cpp
@@ -10,6 +10,7 @@
 
 #include <sycl/detail/defines_elementary.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -32,43 +33,22 @@ constexpr unsigned EAMTestKernelNumArgs3 = 4;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<EAMTestKernel1> {
+template <>
+struct KernelInfo<EAMTestKernel1> : public unittest::MockKernelInfoBase {
   static constexpr unsigned getNumParams() { return EAMTestKernelNumArgs1; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
   static constexpr const char *getName() { return EAMTestKernelName1; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<EAMTestKernel2> {
+template <>
+struct KernelInfo<EAMTestKernel2> : public unittest::MockKernelInfoBase {
   static constexpr unsigned getNumParams() { return EAMTestKernelNumArgs2; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
   static constexpr const char *getName() { return EAMTestKernelName2; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
-template <> struct KernelInfo<EAMTestKernel3> {
+template <>
+struct KernelInfo<EAMTestKernel3> : public unittest::MockKernelInfoBase {
   static constexpr unsigned getNumParams() { return EAMTestKernelNumArgs3; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
   static constexpr const char *getName() { return EAMTestKernelName3; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 } // namespace detail
diff --git a/sycl/unittests/queue/GetProfilingInfo.cpp b/sycl/unittests/queue/GetProfilingInfo.cpp
index 3d9293a5a3bf1..c3122ca64aca0 100644
--- a/sycl/unittests/queue/GetProfilingInfo.cpp
+++ b/sycl/unittests/queue/GetProfilingInfo.cpp
@@ -15,6 +15,7 @@
 
 #include <sycl/detail/defines_elementary.hpp>
 
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 #include <helpers/TestKernel.hpp>
@@ -26,17 +27,9 @@ class InfoTestKernel;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<InfoTestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<InfoTestKernel> : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() { return "InfoTestKernel"; }
-  static constexpr bool isESIMD() { return false; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
-  static constexpr int64_t getKernelSize() { return 1; }
 };
 
 } // namespace detail
diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp
index 31dfc8118b657..8e0105e992c01 100644
--- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp
+++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp
@@ -8,6 +8,7 @@
 
 #include "SchedulerTest.hpp"
 #include "SchedulerTestUtils.hpp"
+#include <helpers/MockKernelInfo.hpp>
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
 
@@ -85,18 +86,13 @@ class StreamAUXCmdsWait_TestKernel;
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
-template <> struct KernelInfo<StreamAUXCmdsWait_TestKernel> {
-  static constexpr unsigned getNumParams() { return 0; }
-  static const kernel_param_desc_t &getParamDesc(int) {
-    static kernel_param_desc_t Dummy;
-    return Dummy;
-  }
+template <>
+struct KernelInfo<StreamAUXCmdsWait_TestKernel>
+    : public unittest::MockKernelInfoBase {
   static constexpr const char *getName() {
     return "StreamAUXCmdsWait_TestKernel";
   }
   static constexpr bool isESIMD() { return true; }
-  static constexpr bool callsThisItem() { return false; }
-  static constexpr bool callsAnyThisFreeFunction() { return false; }
   static constexpr int64_t getKernelSize() { return sizeof(sycl::stream); }
 };