diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index cd619a94f71bd..bab10b79abe84 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -1350,11 +1350,18 @@ SmallPtrSet collectGenXVolatileTypes(Module &M) {
       continue;
     auto GTy = dyn_cast<StructType>(PTy->getPointerElementType());
     // TODO FIXME relying on type name in LLVM IR is fragile, needs rework
-    if (!GTy || !GTy->getName().endswith(
-                    "cl::sycl::ext::intel::experimental::esimd::simd"))
+    if (!GTy ||
+        !GTy->getName().endswith("sycl::ext::intel::experimental::esimd::simd"))
       continue;
     assert(GTy->getNumContainedTypes() == 1);
     auto VTy = GTy->getContainedType(0);
+    if (GTy = dyn_cast<StructType>(VTy)) {
+      assert(
+          GTy &&
+          GTy->getName().endswith(
+              "sycl::ext::intel::experimental::esimd::detail::simd_obj_impl"));
+      VTy = GTy->getContainedType(0);
+    }
     assert(VTy->isVectorTy());
     GenXVolatileTypeSet.insert(VTy);
   }
diff --git a/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp
index 9b30c72d3436a..848b289352a03 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp
@@ -110,14 +110,18 @@ ModulePass *llvm::createESIMDLowerVecArgPass() {
 // nullptr.
 Type *ESIMDLowerVecArgPass::getSimdArgPtrTyOrNull(Value *arg) {
   auto ArgType = dyn_cast<PointerType>(arg->getType());
-  if (!ArgType || !ArgType->getElementType()->isStructTy())
+  if (!ArgType)
     return nullptr;
-  auto ContainedType = ArgType->getElementType();
-  if ((ContainedType->getStructNumElements() != 1) ||
-      !ContainedType->getStructElementType(0)->isVectorTy())
+  Type *Res = nullptr;
+  StructType *ST = dyn_cast_or_null<StructType>(ArgType->getElementType());
+
+  while (ST && (ST->getStructNumElements() == 1)) {
+    Res = ST->getStructElementType(0);
+    ST = dyn_cast<StructType>(Res);
+  }
+  if (!Res || !Res->isVectorTy())
     return nullptr;
-  return PointerType::get(ContainedType->getStructElementType(0),
-                          ArgType->getPointerAddressSpace());
+  return PointerType::get(Res, ArgType->getPointerAddressSpace());
 }
 
 // F may have multiple arguments of type simd*. This
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp
index d8a493cd6b3d7..5d8661bb85cd5 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp
@@ -18,9 +18,6 @@
 #include
 #include
 
-#define __SEIEED sycl::ext::intel::experimental::esimd::detail
-#define __SEIEE sycl::ext::intel::experimental::esimd
-
 // \brief __esimd_rdregion: region access intrinsic.
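// Illustrative sketch (not part of the patch; helper name and include assumed):
// with the new hierarchy the storage vector of simd<T, N> sits one struct level
// deeper (inside simd_obj_impl), so getSimdArgPtrTyOrNull above peels nested
// single-element structs until it reaches the underlying vector type.

#include "llvm/IR/DerivedTypes.h"

static llvm::Type *peelSingleElementStructs(llvm::StructType *ST) {
  llvm::Type *Res = nullptr;
  while (ST && ST->getNumElements() == 1) {
    Res = ST->getElementType(0);                // descend into the only member
    ST = llvm::dyn_cast<llvm::StructType>(Res); // continue while it is a struct
  }
  return (Res && Res->isVectorTy()) ? Res : nullptr; // vector type, or nullptr
}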
// // @param T the element data type, one of i8, i16, i32, i64, half, float, @@ -125,14 +122,14 @@ template __esimd_wrregion(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, uint16_t Offset, - __SEIEE::mask_type_t Mask = 1); + __SEIEED::simd_mask_storage_t Mask = 1); template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_wrindirect(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, __SEIEED::vector_type_t Offset, - __SEIEE::mask_type_t Mask = 1); + __SEIEED::simd_mask_storage_t Mask = 1); __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -286,7 +283,7 @@ template __esimd_wrregion(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, uint16_t Offset, - __SEIEE::mask_type_t Mask) { + __SEIEED::simd_mask_storage_t Mask) { uint16_t EltOffset = Offset / sizeof(T); assert(Offset % sizeof(T) == 0); @@ -310,7 +307,7 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_wrindirect(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, __SEIEED::vector_type_t Offset, - __SEIEE::mask_type_t Mask) { + __SEIEED::simd_mask_storage_t Mask) { __SEIEED::vector_type_t Result = OldVal; for (int i = 0; i < M; ++i) { if (Mask[i]) { @@ -324,6 +321,3 @@ __esimd_wrindirect(__SEIEED::vector_type_t OldVal, } #endif // __SYCL_DEVICE_ONLY__ - -#undef __SEIEE -#undef __SEIEED diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 619f9f598f198..7f2f5fc5bdd10 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -15,11 +15,10 @@ #include #include #include +#include #include -#define __SEIEED sycl::ext::intel::experimental::esimd::detail - // saturation intrinsics template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t @@ -385,8 +384,6 @@ inline T extract(const uint32_t &width, const uint32_t &offset, uint32_t src, return ret; } -#define __SEIEEED sycl::ext::intel::experimental::esimd::emu::detail - template inline __SEIEED::vector_type_t __esimd_satf(__SEIEED::vector_type_t src) { @@ -1327,8 +1324,4 @@ __esimd_reduced_smin(__SEIEED::vector_type_t src1, return __esimd_reduced_min(src1, src2); } -#undef __SEIEEED - #endif // #ifdef __SYCL_DEVICE_ONLY__ - -#undef __SEIEED diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 03ec0ab45b171..1751c32d37084 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -74,9 +74,6 @@ constexpr unsigned int ElemsPerAddrDecoding(unsigned int ElemsPerAddrEncoded) { } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) -#define __SEIEE sycl::ext::intel::experimental::esimd -#define __SEIEED sycl::ext::intel::experimental::esimd::detail - // flat_read does flat-address gather template __esimd_flat_read(__SEIEED::vector_type_t addrs, int ElemsPerAddr = NumBlk, - __SEIEED::vector_type_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1); // flat_write does flat-address scatter template addrs, __SEIEED::vector_type_t vals, - int ElemsPerAddr = NumBlk, __SEIEED::vector_type_t pred = 1); + int ElemsPerAddr = NumBlk, __SEIEED::simd_mask_storage_t pred = 1); // flat_block_read reads a block of data from one flat address template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION 
__esimd_flat_read4(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1); // flat_write does flat-address scatter template addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1); // Low-level surface-based gather. Collects elements located at given offsets in // a surface and returns them as a single \ref simd object. Element can be @@ -205,7 +202,7 @@ template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_surf_write(__SEIEED::vector_type_t pred, int16_t scale, +__esimd_surf_write(__SEIEED::simd_mask_storage_t pred, int16_t scale, SurfIndAliasTy surf_ind, uint32_t global_offset, __SEIEED::vector_type_t elem_offsets, __SEIEED::vector_type_t vals) @@ -229,7 +226,7 @@ template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L3H = __SEIEE::CacheHint::None> SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_flat_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred); + __SEIEED::simd_mask_storage_t pred); template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H = __SEIEE::CacheHint::None, @@ -237,7 +234,7 @@ template <__SEIEE::atomic_op Op, typename Ty, int N, SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_flat_atomic1(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t pred); + __SEIEED::simd_mask_storage_t pred); template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H = __SEIEE::CacheHint::None, @@ -246,7 +243,7 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_flat_atomic2(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t pred); + __SEIEED::simd_mask_storage_t pred); // esimd_barrier, generic group barrier SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_barrier(); @@ -262,14 +259,14 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_slm_fence(uint8_t cntl); template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_slm_read(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1); // slm_write does SLM scatter template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_slm_write(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1); // slm_block_read reads a block of data from SLM template @@ -286,33 +283,33 @@ template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_slm_read4(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1); // slm_write4 does SLM scatter4 template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_slm_write4( __SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1); // slm_atomic: SLM atomic template <__SEIEE::atomic_op Op, typename Ty, int N> SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_slm_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred); + __SEIEED::simd_mask_storage_t pred); template <__SEIEE::atomic_op Op, typename Ty, int N> SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_slm_atomic1(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t pred); + __SEIEED::simd_mask_storage_t pred); template <__SEIEE::atomic_op Op, typename Ty, int N> 
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_slm_atomic2(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t pred); + __SEIEED::simd_mask_storage_t pred); // Media block load // @@ -418,9 +415,9 @@ template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_raw_sends_load(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, - uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, - uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, + __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, + uint8_t numSrc1, uint8_t numDst, uint8_t sfid, + uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, __SEIEED::vector_type_t msgSrc1, __SEIEED::vector_type_t msgDst); @@ -454,9 +451,9 @@ __esimd_raw_sends_load(uint8_t modifier, uint8_t execSize, template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_raw_send_load(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, - uint8_t numSrc0, uint8_t numDst, uint8_t sfid, - uint32_t exDesc, uint32_t msgDesc, + __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, + uint8_t numDst, uint8_t sfid, uint32_t exDesc, + uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, __SEIEED::vector_type_t msgDst); @@ -485,13 +482,11 @@ __esimd_raw_send_load(uint8_t modifier, uint8_t execSize, /// @param msgSrc1 the second source operand of send message. /// template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, - uint8_t numSrc0, uint8_t numSrc1, uint8_t sfid, - uint32_t exDesc, uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgSrc1); +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_raw_sends_store( + uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, + uint8_t numSrc0, uint8_t numSrc1, uint8_t sfid, uint32_t exDesc, + uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, + __SEIEED::vector_type_t msgSrc1); /// \brief Raw send store. 
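// Illustrative sketch (not part of the patch; assumes simd_mask_storage_t<N> is
// the raw clang-vector type backing the new simd_mask<N> class and that the
// ESIMD umbrella header is included; function name is hypothetical): a
// user-level mask reaches the 'pred' parameters of the intrinsics above via data().
inline void mask_storage_example() {
  using namespace sycl::ext::intel::experimental::esimd;
  simd_mask<16> pred = 1;                            // all lanes enabled
  detail::simd_mask_storage_t<16> raw = pred.data(); // raw intrinsic operand
  (void)raw; // public wrappers forward this value as the intrinsics' 'pred'
}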
/// @@ -515,9 +510,8 @@ __esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, - uint8_t numSrc0, uint8_t sfid, uint32_t exDesc, - uint32_t msgDesc, + __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, + uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0); #ifndef __SYCL_DEVICE_ONLY__ @@ -525,7 +519,7 @@ template inline __SEIEED::vector_type_t __esimd_flat_read(__SEIEED::vector_type_t addrs, int ElemsPerAddr, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { auto NumBlkDecoded = __SEIEED::ElemsPerAddrDecoding(NumBlk); __SEIEED::vector_type_t V; ElemsPerAddr = __SEIEED::ElemsPerAddrDecoding(ElemsPerAddr); @@ -551,7 +545,7 @@ template inline __SEIEED::vector_type_t __esimd_flat_read4(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t V; unsigned int Next = 0; @@ -601,7 +595,7 @@ inline void __esimd_flat_write( __SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - int ElemsPerAddr, __SEIEED::vector_type_t pred) { + int ElemsPerAddr, __SEIEED::simd_mask_storage_t pred) { auto NumBlkDecoded = __SEIEED::ElemsPerAddrDecoding(NumBlk); ElemsPerAddr = __SEIEED::ElemsPerAddrDecoding(ElemsPerAddr); @@ -626,7 +620,7 @@ template addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t V; unsigned int Next = 0; @@ -830,7 +824,7 @@ inline void __esimd_slm_fence(uint8_t cntl) {} template inline __SEIEED::vector_type_t __esimd_slm_read(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -839,7 +833,7 @@ __esimd_slm_read(__SEIEED::vector_type_t addrs, template inline void __esimd_slm_write(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred) {} + __SEIEED::simd_mask_storage_t pred) {} // slm_block_read reads a block of data from SLM template @@ -857,7 +851,7 @@ inline void __esimd_slm_block_write(uint32_t addr, template inline __SEIEED::vector_type_t __esimd_slm_read4(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -867,13 +861,13 @@ template inline void __esimd_slm_write4( __SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred) {} + __SEIEED::simd_mask_storage_t pred) {} // slm_atomic: SLM atomic template <__SEIEE::atomic_op Op, typename Ty, int N> inline __SEIEED::vector_type_t __esimd_slm_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -882,7 +876,7 @@ template <__SEIEE::atomic_op Op, typename Ty, int N> inline __SEIEED::vector_type_t __esimd_slm_atomic1(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -892,7 +886,7 @@ inline __SEIEED::vector_type_t __esimd_slm_atomic2(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -901,7 +895,7 @@ template <__SEIEE::atomic_op Op, 
typename Ty, int N, __SEIEE::CacheHint L1H, __SEIEE::CacheHint L3H> inline __SEIEED::vector_type_t __esimd_flat_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -911,7 +905,7 @@ template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H, inline __SEIEED::vector_type_t __esimd_flat_atomic1(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -922,7 +916,7 @@ inline __SEIEED::vector_type_t __esimd_flat_atomic2(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t pred) { + __SEIEED::simd_mask_storage_t pred) { __SEIEED::vector_type_t retv; return retv; } @@ -986,14 +980,12 @@ inline uint32_t __esimd_get_value(AccessorTy acc) { /// template -inline __SEIEED::vector_type_t -__esimd_raw_sends_load(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, - uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, - uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgSrc1, - __SEIEED::vector_type_t msgDst) { +inline __SEIEED::vector_type_t __esimd_raw_sends_load( + uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, + uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, uint8_t sfid, + uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, + __SEIEED::vector_type_t msgSrc1, + __SEIEED::vector_type_t msgDst) { throw cl::sycl::feature_not_supported(); return 0; } @@ -1025,13 +1017,11 @@ __esimd_raw_sends_load(uint8_t modifier, uint8_t execSize, /// Returns a simd vector of type Ty1 and size N1. 
/// template -inline __SEIEED::vector_type_t -__esimd_raw_send_load(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, - uint8_t numSrc0, uint8_t numDst, uint8_t sfid, - uint32_t exDesc, uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgDst) { +inline __SEIEED::vector_type_t __esimd_raw_send_load( + uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, + uint8_t numSrc0, uint8_t numDst, uint8_t sfid, uint32_t exDesc, + uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, + __SEIEED::vector_type_t msgDst) { throw cl::sycl::feature_not_supported(); return 0; } @@ -1062,7 +1052,7 @@ __esimd_raw_send_load(uint8_t modifier, uint8_t execSize, /// template inline void __esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, + __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, uint8_t numSrc1, uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, @@ -1092,7 +1082,7 @@ inline void __esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, /// template inline void __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, - __SEIEED::vector_type_t pred, + __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0) { @@ -1100,6 +1090,3 @@ inline void __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, } #endif // __SYCL_DEVICE_ONLY__ - -#undef __SEIEED -#undef __SEIEE diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp new file mode 100644 index 0000000000000..039d27ca95874 --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp @@ -0,0 +1,424 @@ +//==-------------- operators.hpp - DPC++ Explicit SIMD API -----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Binary operator definitions for ESIMD types. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include +#include + +// Table of contents: +// +// simd_obj_impl/simd/simd_mask global operators +// bitwise logic and arithmetic operators +// simd_obj_impl BINOP simd_obj_impl +// simd_obj_impl BINOP SCALAR +// SCALAR BINOP simd_obj_impl +// comparison operators +// simd_obj_impl CMPOP simd_obj_impl +// simd_obj_impl CMPOP SCALAR +// SCALAR CMPOP simd_obj_impl +// simd_view global operators +// bitwise logic and arithmetic operators +// simd_view BINOP simd_view +// simd* BINOP simd_view +// simd_view BINOP simd* +// SCALAR BINOP simd_view +// simd_view BINOP SCALAR +// comparison operators +// simd_view CMPOP simd_view +// simd_view CMPOP simd_obj_impl +// simd_obj_impl CMPOP simd_view +// simd_view CMPOP SCALAR +// SCALAR CMPOP simd_view +// +// Some operations are enabled only for particular element and simd object type +// (simd or simd_mask): +// - bitwise logic operations - for integral element types (both simd and +// simd_mask) +// - bit shift operations and and '%' - for the simd type (not for simd_mask) +// with integral element types. 
+// - arithmetic binary operations - for the simd type (not for simd_mask) +// In all cases, when an operation has a simd_view and a simd_obj_impl's +// subclass objects as operands, it is enabled only when: +// - simd_view's base type matches the simd object operand. I.e. only +// { simd_view, simd } and { simd_view, simd_mask } +// pairs are enabled (with any order of operand types). +// - simd_view's value length matches the length of the simd object operand + +// Put operators into the ESIMD namespace to make argument-dependent lookup find +// these operators instead of those defined in e.g. sycl namespace (which would +// stop further lookup, leaving just non-viable sycl::operator < etc. on the +// table). +namespace __SEIEED { + +//////////////////////////////////////////////////////////////////////////////// +// simd_obj_impl global operators +//////////////////////////////////////////////////////////////////////////////// + +// ========= simd_obj_impl bitwise logic and arithmetic operators + +#define __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(BINOP, COND) \ + \ + /* simd_obj_impl BINOP simd_obj_impl */ \ + template class SimdT, \ + class SimdTx = SimdT, class = std::enable_if_t> \ + inline auto operator BINOP( \ + const __SEIEED::simd_obj_impl> &LHS, \ + const __SEIEED::simd_obj_impl> &RHS) { \ + if constexpr (__SEIEED::is_simd_type_v>) { \ + using SimdPromotedT = \ + __SEIEED::computation_type_t, SimdT>; \ + using VecT = typename SimdPromotedT::vector_type; \ + return SimdPromotedT(__SEIEED::convert(LHS.data()) \ + BINOP __SEIEED::convert(RHS.data())); \ + } else { \ + /* for SimdT=simd_mask_impl T1 and T2 are both equal to \ + * simd_mask_elem_type */ \ + return SimdT(LHS.data() BINOP RHS.data()); \ + } \ + } \ + \ + /* simd_obj_impl BINOP SCALAR */ \ + template class SimdT1, class T2, \ + class SimdTx = SimdT1, class = std::enable_if_t> \ + inline auto operator BINOP( \ + const __SEIEED::simd_obj_impl> &LHS, T2 RHS) { \ + if constexpr (__SEIEED::is_simd_type_v>) { \ + /* convert the SCALAR to vector type and reuse the basic operation over \ + * simd objects */ \ + return LHS BINOP SimdT1(RHS); \ + } else { \ + /* SimdT1 is a mask, T1 is mask element type - convert RHS implicitly to \ + * T1 */ \ + return LHS BINOP SimdT1(RHS); \ + } \ + } \ + \ + /* SCALAR BINOP simd_obj_impl */ \ + template class SimdT2, \ + class SimdTx = SimdT2, class = std::enable_if_t> \ + inline auto operator BINOP( \ + T1 LHS, const __SEIEED::simd_obj_impl> &RHS) { \ + if constexpr (__SEIEED::is_simd_type_v>) { \ + /* convert the SCALAR to vector type and reuse the basic operation over \ + * simd objects */ \ + return SimdT2(LHS) BINOP RHS; \ + } else { \ + /* simd_mask_case */ \ + return SimdT2(LHS) BINOP RHS; \ + } \ + } + +#define __ESIMD_BITWISE_OP_FILTER \ + std::is_integral_v &&std::is_integral_v +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(^, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(|, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(&, __ESIMD_BITWISE_OP_FILTER) +#undef __ESIMD_BITWISE_OP_FILTER + +#define __ESIMD_SHIFT_OP_FILTER \ + std::is_integral_v &&std::is_integral_v \ + &&__SEIEED::is_simd_type_v +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(%, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(<<, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(>>, __ESIMD_SHIFT_OP_FILTER) +#undef __ESIMD_SHIFT_OP_FILTER + +#define __ESIMD_ARITH_OP_FILTER \ + __SEIEED::is_vectorizable_v &&__SEIEED::is_vectorizable_v \ + &&__SEIEED::is_simd_type_v + +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(+, 
__ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(-, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(*, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) +#undef __ESIMD_ARITH_OP_FILTER + +#undef __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP + +// ========= simd_obj_impl comparison operators +// Both simd and simd_mask will match simd_obj_impl argument when resolving +// operator overloads. + +#define __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(CMPOP, COND) \ + \ + /* simd_obj_impl CMPOP simd_obj_impl */ \ + template class SimdT, \ + class SimdTx = SimdT, class = std::enable_if_t> \ + inline __SEIEE::simd_mask operator CMPOP( \ + const __SEIEED::simd_obj_impl> &LHS, \ + const __SEIEED::simd_obj_impl> &RHS) { \ + using MaskVecT = typename __SEIEE::simd_mask::vector_type; \ + \ + if constexpr (__SEIEED::is_simd_type_v>) { \ + using PromSimdT = \ + __SEIEED::computation_type_t, SimdT>; \ + using PromVecT = typename PromSimdT::vector_type; \ + auto ResVec = __SEIEED::convert(LHS.data()) \ + CMPOP __SEIEED::convert(RHS.data()); \ + return __SEIEE::simd_mask(__SEIEED::convert(ResVec) & \ + MaskVecT(1)); \ + } else { \ + /* this is comparison of masks, don't perform type promotion */ \ + auto ResVec = LHS.data() CMPOP RHS.data(); \ + return __SEIEE::simd_mask(__SEIEED::convert(ResVec) & \ + MaskVecT(1)); \ + } \ + } \ + \ + /* simd_obj_impl CMPOP SCALAR */ \ + template class SimdT1, class T2, \ + class SimdTx = SimdT1, \ + class = std::enable_if_t<__SEIEED::is_vectorizable_v && COND>> \ + inline __SEIEE::simd_mask operator CMPOP( \ + const __SEIEED::simd_obj_impl> &LHS, T2 RHS) { \ + if constexpr (__SEIEED::is_simd_type_v>) \ + /* simd case */ \ + return LHS CMPOP SimdT1(RHS); \ + else \ + /* simd_mask case - element type is fixed */ \ + return LHS CMPOP SimdT1((T1)RHS); \ + } \ + \ + /* SCALAR CMPOP simd_obj_impl */ \ + template class SimdT2, \ + class SimdTx = SimdT2, \ + class = std::enable_if_t<__SEIEED::is_vectorizable_v && COND>> \ + inline __SEIEE::simd_mask operator CMPOP( \ + T1 LHS, const __SEIEED::simd_obj_impl> &RHS) { \ + if constexpr (__SEIEED::is_simd_type_v>) \ + /* simd case */ \ + return SimdT2(LHS) CMPOP RHS; \ + else \ + /* simd_mask case - element type is fixed */ \ + return SimdT2((T2)LHS) CMPOP RHS; \ + } + +// Equality comparison is defined for all simd_obj_impl subclasses. +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(==, true) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(!=, true) + +// Relational operators are defined only for the simd type. 
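// Illustrative sketch (not part of the patch; assumes an ESIMD context with the
// umbrella header included; function name is hypothetical): comparisons on simd
// now yield simd_mask, masks combine via the logical operators, and mixed-type
// arithmetic is promoted through computation_type_t.
inline void simd_op_example() {
  simd<float, 8> x(0.f, 1.f);               // 0, 1, ..., 7
  simd<float, 8> y(3.f);                    // broadcast
  simd_mask<8> lt = x < y;                  // relational op (simd only, defined below)
  simd_mask<8> nz = x != 0.f;               // equality op (simd and simd_mask)
  simd_mask<8> both = lt && nz;             // logical op (simd_mask only)
  simd<float, 8> sum = x + simd<int, 8>(1); // int operand promoted to float
  (void)both;
  (void)sum;
}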
+__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<=, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>=, __SEIEED::is_simd_type_v) + +// Logical operators are defined only for the simd_mask type +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(&&, __SEIEED::is_simd_mask_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(||, __SEIEED::is_simd_mask_type_v) + +#undef __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP +} // namespace __SEIEED + +namespace __SEIEE { +//////////////////////////////////////////////////////////////////////////////// +// simd_view global operators +//////////////////////////////////////////////////////////////////////////////// + +// ========= simd_view bitwise logic and arithmetic operators + +#define __ESIMD_DEF_SIMD_VIEW_BIN_OP(BINOP, COND) \ + \ + /* simd_view BINOP simd_view */ \ + template ::element_type, \ + class T2 = typename __SEIEE::shape_type::element_type, \ + auto N1 = __SEIEE::shape_type::length, \ + auto N2 = __SEIEE::shape_type::length, \ + class = \ + std::enable_if_t<__SEIEED::is_simd_type_v == \ + __SEIEED::is_simd_type_v && \ + (N1 == N2 || N1 == 1 || N2 == 1) && COND>> \ + inline auto operator BINOP( \ + const __SEIEE::simd_view &LHS, \ + const __SEIEE::simd_view &RHS) { \ + if constexpr (N1 == 1) \ + return (T1)LHS.read()[0] BINOP RHS.read(); \ + else if constexpr (N2 == 1) \ + return LHS.read() BINOP(T2) RHS.read()[0]; \ + else \ + return LHS.read() BINOP RHS.read(); \ + } \ + \ + /* simd* BINOP simd_view */ \ + template ::element_type, \ + class = std::enable_if_t< \ + __SEIEED::is_simd_obj_impl_derivative_v && \ + (__SEIEED::is_simd_type_v == \ + __SEIEED::is_simd_type_v)&&(SimdT1::length == \ + __SEIEE::shape_type< \ + RegionT2>::length) && \ + COND>> \ + inline auto operator BINOP( \ + const SimdT1 &LHS, const __SEIEE::simd_view &RHS) { \ + return LHS BINOP RHS.read(); \ + } \ + \ + /* simd_view BINOP simd* */ \ + template < \ + class SimdT1, class RegionT1, class SimdT2, \ + class T1 = typename __SEIEE::shape_type::element_type, \ + class T2 = typename SimdT2::element_type, \ + class = std::enable_if_t< \ + __SEIEED::is_simd_obj_impl_derivative_v && \ + __SEIEED::is_simd_type_v == \ + __SEIEED::is_simd_type_v && \ + (SimdT2::length == __SEIEE::shape_type::length) && COND>> \ + inline auto operator BINOP(const __SEIEE::simd_view &LHS, \ + const SimdT2 &RHS) { \ + return LHS.read() BINOP RHS; \ + } \ + \ + /* SCALAR BINOP simd_view */ \ + template && COND>> \ + inline auto operator BINOP(T1 LHS, const SimdViewT2 &RHS) { \ + using SimdT = typename SimdViewT2::value_type; \ + return SimdT(LHS) BINOP RHS.read(); \ + } \ + \ + /* simd_view BINOP SCALAR */ \ + template && COND>> \ + inline auto operator BINOP(const SimdViewT1 &LHS, T2 RHS) { \ + using SimdT = typename SimdViewT1::value_type; \ + return LHS.read() BINOP SimdT(RHS); \ + } + +#define __ESIMD_BITWISE_OP_FILTER \ + std::is_integral_v &&std::is_integral_v +__ESIMD_DEF_SIMD_VIEW_BIN_OP(^, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(|, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(&, __ESIMD_BITWISE_OP_FILTER) +#undef __ESIMD_BITWISE_OP_FILITER + +#define __ESIMD_SHIFT_OP_FILTER \ + std::is_integral_v &&std::is_integral_v \ + &&__SEIEED::is_simd_type_v + +__ESIMD_DEF_SIMD_VIEW_BIN_OP(%, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(<<, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(>>, __ESIMD_SHIFT_OP_FILTER) +#undef __ESIMD_SHIFT_OP_FILTER + +#define 
__ESIMD_ARITH_OP_FILTER \ + __SEIEED::is_simd_type_v &&__SEIEED::is_vectorizable_v \ + &&__SEIEED::is_vectorizable_v + +__ESIMD_DEF_SIMD_VIEW_BIN_OP(+, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(-, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(*, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) +#undef __ESIMD_ARITH_OP_FILTER + +__ESIMD_DEF_SIMD_VIEW_BIN_OP(&&, __SEIEED::is_simd_mask_type_v) +__ESIMD_DEF_SIMD_VIEW_BIN_OP(||, __SEIEED::is_simd_mask_type_v) + +#undef __ESIMD_DEF_SIMD_VIEW_BIN_OP + +// ========= simd_view comparison operators + +#define __ESIMD_DEF_SIMD_VIEW_CMP_OP(CMPOP, COND) \ + \ + /* simd_view CMPOP simd_view */ \ + template ::length, \ + auto N2 = __SEIEE::shape_type::length, \ + class = std::enable_if_t == \ + __SEIEED::is_simd_type_v< \ + SimdT2>)&&/* the length of the views \ + must match as well: */ \ + (N1 == N2 || N1 == 1 || N2 == 1) && \ + COND>> \ + inline auto operator CMPOP( \ + const __SEIEE::simd_view &LHS, \ + const __SEIEE::simd_view &RHS) { \ + using T1 = typename __SEIEE::shape_type::element_type; \ + using T2 = typename __SEIEE::shape_type::element_type; \ + if constexpr (N1 == 1) \ + return (T1)LHS.read()[0] CMPOP RHS.read(); \ + else if constexpr (N2 == 1) \ + return LHS.read() CMPOP(T2) RHS.read()[0]; \ + else \ + return LHS.read() CMPOP RHS.read(); \ + } \ + \ + /* simd_view CMPOP simd_obj_impl */ \ + template ::length == N2) && \ + (__SEIEED::is_simd_type_v == \ + __SEIEED::is_simd_type_v)&&COND>> \ + inline __SEIEE::simd_mask operator CMPOP( \ + const __SEIEE::simd_view &LHS, \ + const __SEIEED::simd_obj_impl &RHS) { \ + return LHS.read() CMPOP SimdT2(RHS.data()); \ + } \ + \ + /* simd_obj_impl CMPOP simd_view */ \ + template ::length == N1) && \ + (__SEIEED::is_simd_type_v == \ + __SEIEED::is_simd_type_v)&&COND>> \ + inline __SEIEE::simd_mask operator CMPOP( \ + const __SEIEED::simd_obj_impl &LHS, \ + const __SEIEE::simd_view &RHS) { \ + return SimdT1(LHS.data()) CMPOP RHS.read(); \ + } \ + \ + /* simd_view CMPOP SCALAR */ \ + template && COND>> \ + inline auto operator CMPOP(const __SEIEE::simd_view &LHS, \ + T2 RHS) { \ + using SimdValueT = \ + typename __SEIEE::simd_view::value_type; \ + return LHS.read() CMPOP SimdValueT(RHS); \ + } \ + \ + /* SCALAR CMPOP simd_view */ \ + template && COND>> \ + inline auto operator CMPOP( \ + T1 LHS, const __SEIEE::simd_view &RHS) { \ + using SimdValueT = \ + typename __SEIEE::simd_view::value_type; \ + return SimdValueT(LHS) CMPOP RHS.read(); \ + } + +// Equality comparison is defined for views of all simd_obj_impl derivatives. +__ESIMD_DEF_SIMD_VIEW_CMP_OP(==, true) +__ESIMD_DEF_SIMD_VIEW_CMP_OP(!=, true) + +// Relational operators are defined only for views of the simd class. +__ESIMD_DEF_SIMD_VIEW_CMP_OP(<, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_VIEW_CMP_OP(>, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_VIEW_CMP_OP(<=, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_VIEW_CMP_OP(>=, __SEIEED::is_simd_type_v) + +#undef __ESIMD_DEF_SIMD_VIEW_CMP_OP + +} // namespace __SEIEE diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/region.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/region.hpp index 39092fea83616..df541f4e81d14 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/region.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/region.hpp @@ -50,15 +50,16 @@ struct region_base { // A basic 1D region type. 
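// Illustrative sketch (not part of the patch; assumes an ESIMD context with the
// umbrella header included; function name is hypothetical): the simd_view
// operators defined in operators.hpp above let region selects be compared
// directly, and the resulting simd_mask can drive a predicated merge.
inline void simd_view_op_example() {
  simd<int, 16> a(0, 1);                     // 0, 1, ..., 15
  auto evens = a.select<8, 2>(0);            // simd_view of elements 0, 2, ..., 14
  simd_mask<8> m = evens > 5;                // simd_view CMPOP SCALAR
  simd<int, 8> r = a.select<8, 2>(1).read(); // odd elements as a simd value
  r.merge(0, m);                             // where m is set take 0, else keep r
  (void)r;
}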
template -using region1d_t = region_base; +using region1d_t = region_base; // A basic 2D region type. template using region2d_t = region_base; // A region with a single element. -template -using region1d_scalar_t = region_base; +template +using region1d_scalar_t = + region_base; // simd_view forward declaration. template class simd_view; @@ -92,12 +93,20 @@ template struct shape_type> { using element_type = Ty; using type = region1d_t; + static inline constexpr int length = type::length; +}; + +template struct shape_type> { + using element_type = Ty; + using type = region1d_t; + static inline constexpr int length = type::length; }; template struct shape_type> { using element_type = Ty; using type = region2d_t; + static inline constexpr int length = type::length; }; // Forward the shape computation on the top region type. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp new file mode 100644 index 0000000000000..407d592cf780c --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp @@ -0,0 +1,128 @@ +//==------------ - simd_mask_impl.hpp - DPC++ Explicit SIMD API ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Implementation detail of Explicit SIMD mask class. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace ext { +namespace intel { +namespace experimental { +namespace esimd { +namespace detail { + +#define __ESIMD_MASK_DEPRECATION_MSG \ + "Use of 'simd'/'simd_view' class to represent predicate or mask " \ + "is deprecated. Use " \ + "'simd_mask'/'simd_view' instead." + +template +class simd_mask_impl + : public detail::simd_obj_impl< + T, N, simd_mask_impl, + std::enable_if_t>> { + using base_type = detail::simd_obj_impl>; + +public: + using element_type = T; + using vector_type = typename base_type::vector_type; + static_assert(std::is_same_v> && + "mask impl type mismatch"); + + simd_mask_impl() = default; + simd_mask_impl(const simd_mask_impl &other) : base_type(other) {} + + /// Broadcast constructor with conversion. + template >> + simd_mask_impl(T1 Val) : base_type((T)Val) {} + + /// Implicit conversion constructor from a raw vector object. + // TODO this should be made inaccessible from user code. + simd_mask_impl(const vector_type &Val) : base_type(Val) {} + + /// Initializer list constructor. + __SYCL_DEPRECATED("use constructor from array, e.g: simd_mask<3> x({0,1,1});") + simd_mask_impl(std::initializer_list Ilist) : base_type(Ilist) {} + + /// Construct from an array. To allow e.g. simd_mask m({1,0,0,1,...}). + template > + simd_mask_impl(const element_type(&&Arr)[N1]) { + base_type::template init_from_array(std::move(Arr)); + } + + /// Implicit conversion from simd. + __SYCL_DEPRECATED(__ESIMD_MASK_DEPRECATION_MSG) + simd_mask_impl(const simd &Val) : base_type(Val.data()) {} + + /// Implicit conversion from simd_view. 
+ template < + // viewed simd class parameters + int N1, class T1, + // view region + class RegionT2, + // view element type + class T2 = typename __SEIEE::shape_type::element_type, + // view size in elements + int N2 = __SEIEE::shape_type::length, + // enable only if view length and element type match this object + class = std::enable_if_t>> + __SYCL_DEPRECATED(__ESIMD_MASK_DEPRECATION_MSG) + simd_mask_impl(const simd_view, RegionT2> &Val) + : base_type(Val.read().data()) {} + +private: + static inline constexpr bool mask_size_ok_for_mem_io() { + constexpr unsigned Sz = sizeof(element_type) * N; + return (Sz >= detail::OperandSize::OWORD) && + (Sz % detail::OperandSize::OWORD == 0) && + detail::isPowerOf2(Sz / detail::OperandSize::OWORD) && + (Sz <= 8 * detail::OperandSize::OWORD); + } + +public: + // TODO add accessor-based mask memory operations. + + /// Load constructor. + // Implementation note: use SFINAE to avoid overload ambiguity: + // 1) with 'simd_mask(element_type v)' in 'simd_mask m(0)' + // 2) with 'simd_mask(const T1(&&arr)[N])' in simd_mask + // m((element_type*)p)' + template >> + explicit simd_mask_impl(const T1 *ptr) { + base_type::copy_from(ptr); + } + + /// Broadcast assignment operator to support simd_mask_impl n = a > b; + simd_mask_impl &operator=(element_type val) noexcept { + base_type::set(val); + return *this; + } + + template > + operator bool() { + return base_type::data()[0] != 0; + } +}; + +#undef __ESIMD_MASK_DEPRECATION_MSG + +} // namespace detail +} // namespace esimd +} // namespace experimental +} // namespace intel +} // namespace ext +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp new file mode 100644 index 0000000000000..e7bb5ff37eace --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -0,0 +1,720 @@ +//==------------ - simd_obj_impl.hpp - DPC++ Explicit SIMD API -------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Implement Explicit SIMD vector APIs. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace ext { +namespace intel { +namespace experimental { +namespace esimd { +namespace detail { + +/// The simd_obj_impl vector class. +/// +/// This is a base class for all ESIMD simd classes with real storage (simd, +/// simd_mask_impl). It wraps a clang vector as the storage for the elements. +/// Additionally this class supports region operations that map to Intel GPU +/// regions. The type of a region select or bit_cast_view operation is of +/// simd_view type, which models read-update-write semantics. +/// +/// For the is_simd_obj_impl_derivative helper to work correctly, all derived +/// classes must be templated by element type and number of elements. If fewer +/// template arguments are needed, template aliases can be used +/// (simd_mask_type). 
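// Illustrative sketch (not part of the patch; assumes an ESIMD context with the
// umbrella header included; function name is hypothetical): the simd_mask
// construction and assignment forms defined in simd_mask_impl.hpp above.
inline void simd_mask_example() {
  simd_mask<8> all_on(1);                         // broadcast
  simd_mask<8> pattern({1, 0, 0, 1, 1, 0, 0, 1}); // from array (initializer_list is deprecated)
  simd_mask<1> flag = pattern[0];                 // single-element mask
  if (flag)                                       // operator bool(), N == 1 only
    all_on = 0;                                   // broadcast assignment
  (void)all_on;
}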
+/// +/// \tparam Ty the element type +/// \tparam N number of elements +/// \tparam Derived - a class derived from this one; this class and its +/// derivatives must follow the 'curiously recurring template' pattern. +/// \tparam SFINAE - defaults to 'void' in the forward declarion within +/// types.hpp, used to disable invalid specializations. +/// +/// \ingroup sycl_esimd +template class simd_obj_impl { + template friend class simd_view; + template friend class simd; + template friend class simd_mask_impl; + +public: + /// The underlying builtin data type. + using vector_type = vector_type_t; + + /// The element type of this simd_obj_impl object. + using element_type = Ty; + + /// The number of elements in this simd_obj_impl object. + static constexpr int length = N; + +protected: + template > + void init_from_array(const Ty(&&Arr)[N1]) noexcept { + for (auto I = 0; I < N; ++I) { + M_data[I] = Arr[I]; + } + } + +private: + Derived &cast_this_to_derived() { return reinterpret_cast(*this); } + +public: + /// @{ + /// Constructors. + simd_obj_impl() = default; + simd_obj_impl(const simd_obj_impl &other) { + __esimd_dbg_print(simd_obj_impl(const simd_obj_impl &other)); + set(other.data()); + } + + /// Implicit conversion constructor from another \c simd_obj_impl object. + template + simd_obj_impl( + const simd_obj_impl, + SFINAE> &other) { + __esimd_dbg_print(simd_obj_impl(const simd_obj_impl... > &other)); + if constexpr (std::is_same_v) + set(other.data()); + else + set(__builtin_convertvector(other.data(), vector_type)); + } + + /// Implicit conversion constructor from a raw vector object. + simd_obj_impl(const vector_type &Val) { + __esimd_dbg_print(simd_obj_impl(const vector_type &Val)); + set(Val); + } + + /// This constructor is deprecated for two reasons: + /// 1) it adds confusion between + /// simd s1(1,2); //calls next constructor + /// simd s2{1,2}; //calls this constructor (uniform initialization syntax) + /// 2) no compile-time control over the size of the initializer; e.g. the + /// following will compile: + /// simd x = {1, 2, 3, 4}; + __SYCL_DEPRECATED("use constructor from array, e.g: simd x({1,2,3});") + simd_obj_impl(std::initializer_list Ilist) noexcept { + __esimd_dbg_print(simd_obj_impl(std::initializer_list Ilist)); + int i = 0; + for (auto It = Ilist.begin(); It != Ilist.end() && i < N; ++It) { + M_data[i++] = *It; + } + } + + /// Initialize a simd_obj_impl object with an initial value and step. + simd_obj_impl(Ty Val, Ty Step) noexcept { + __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); +#pragma unroll + for (int i = 0; i < N; ++i) { + M_data[i] = Val; + Val += Step; + } + } + + /// Broadcast constructor + simd_obj_impl(Ty Val) noexcept { + __esimd_dbg_print(simd_obj_impl(Ty Val)); + M_data = Val; + } + + /// Construct from an array. To allow e.g. simd_mask_type m({1,0,0,1,...}). + template > + simd_obj_impl(const Ty(&&Arr)[N1]) noexcept { + __esimd_dbg_print(simd_obj_impl(const Ty(&&Arr)[N1])); + init_from_array(std::move(Arr)); + } + + /// @} + + // Load the object's value from array. + template std::enable_if_t copy_from(const Ty (&Arr)[N1]) { + __esimd_dbg_print(copy_from(const Ty(&Arr)[N1])); + vector_type Tmp; + for (auto I = 0; I < N; ++I) { + Tmp[I] = Arr[I]; + } + set(Tmp); + } + + // Store the object's value to array. + template std::enable_if_t copy_to(Ty (&Arr)[N1]) const { + __esimd_dbg_print(copy_to(Ty(&Arr)[N1])); + for (auto I = 0; I < N; ++I) { + Arr[I] = data()[I]; + } + } + + /// @{ + /// Conversion operators. 
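// Illustrative sketch (not part of the patch; assumes an ESIMD context with the
// umbrella header included; function name is hypothetical): the constructor and
// array copy forms defined above, as seen through the simd subclass.
inline void simd_ctor_example() {
  simd<int, 8> a(5);                        // broadcast: 5, 5, ..., 5
  simd<int, 8> b(0, 2);                     // base + step: 0, 2, ..., 14
  simd<int, 8> c({0, 1, 2, 3, 4, 5, 6, 7}); // from array (initializer_list is deprecated)
  simd<float, 8> d = b;                     // converting construction, int -> float
  float tmp[8];
  d.copy_to(tmp);                           // store to a plain array
  (void)a;
  (void)c;
}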
+ explicit operator const vector_type &() const & { + __esimd_dbg_print(explicit operator const vector_type &() const &); + return M_data; + } + explicit operator vector_type &() & { + __esimd_dbg_print(explicit operator vector_type &() &); + return M_data; + } + + /// Explicit conversion for simd_obj_impl into T. + template > + operator Ty() const { + __esimd_dbg_print(explicit operator Ty()); + return data()[0]; + } + /// @} + + vector_type data() const { + __esimd_dbg_print(vector_type data()); +#ifndef __SYCL_DEVICE_ONLY__ + return M_data; +#else + return __esimd_vload(&M_data); +#endif + } + + /// Whole region read. + Derived read() const { return Derived{data()}; } + + /// Whole region write. + Derived &write(const Derived &Val) { + set(Val.data()); + return cast_this_to_derived(); + } + + /// Whole region update with predicates. + void merge(const Derived &Val, const simd_mask_type &Mask) { + set(__esimd_wrregion(data(), Val.data(), 0, + Mask.data())); + } + + void merge(const Derived &Val1, Derived Val2, const simd_mask_type &Mask) { + Val2.merge(Val1, Mask); + set(Val2.data()); + } + + /// View this simd_obj_impl object in a different element type. + template auto bit_cast_view() &[[clang::lifetimebound]] { + using TopRegionTy = compute_format_type_t; + using RetTy = simd_view; + return RetTy{cast_this_to_derived(), TopRegionTy{0}}; + } + + template + __SYCL_DEPRECATED("use simd_obj_impl::bit_cast_view.") + auto format() & { + return bit_cast_view(); + } + + /// View as a 2-dimensional simd_view. + template + auto bit_cast_view() &[[clang::lifetimebound]] { + using TopRegionTy = + compute_format_type_2d_t; + using RetTy = simd_view; + return RetTy{cast_this_to_derived(), TopRegionTy{0, 0}}; + } + + template + __SYCL_DEPRECATED("use simd_obj_impl::bit_cast_view.") + auto format() & { + return bit_cast_view(); + } + + /// 1D region select, apply a region on top of this LValue object. + /// + /// \tparam Size is the number of elements to be selected. + /// \tparam Stride is the element distance between two consecutive elements. + /// \param Offset is the starting element offset. + /// \return the representing region object. + template + simd_view> + select(uint16_t Offset = 0) &[[clang::lifetimebound]] { + static_assert(Size > 1 || Stride == 1, + "Stride must be 1 in single-element region"); + region1d_t Reg(Offset); + return {cast_this_to_derived(), std::move(Reg)}; + } + + /// 1D region select, apply a region on top of this RValue object. + /// + /// \tparam Size is the number of elements to be selected. + /// \tparam Stride is the element distance between two consecutive elements. + /// \param Offset is the starting element offset. + /// \return the value this region object refers to. + template + resize_a_simd_type_t select(uint16_t Offset = 0) && { + static_assert(Size > 1 || Stride == 1, + "Stride must be 1 in single-element region"); + Derived &&Val = std::move(cast_this_to_derived()); + return __esimd_rdregion(Val.data(), + Offset); + } + + /// Read single element, return value only (not reference). + Ty operator[](int i) const { return data()[i]; } + + /// Read single element, return value only (not reference). + __SYCL_DEPRECATED("use operator[] form.") + Ty operator()(int i) const { return data()[i]; } + + /// Return writable view of a single element. + simd_view> operator[](int i) + [[clang::lifetimebound]] { + return select<1, 1>(i); + } + + /// Return writable view of a single element. 
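// Illustrative sketch (not part of the patch; assumes an ESIMD context with the
// umbrella header included; function name is hypothetical): region selects and
// bit_cast_view defined above model read-update-write regions over the vector.
inline void simd_region_example() {
  simd<int, 16> v(0, 1);                         // 0, 1, ..., 15
  simd<int, 8> evens = v.select<8, 2>(0).read(); // materialize a strided region
  v.select<8, 2>(1) = evens;                     // write back through a view
  auto bytes = v.bit_cast_view<char>();          // same storage viewed as chars
  char b0 = bytes[0];                            // element access through the view
  (void)b0;
}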
+ __SYCL_DEPRECATED("use operator[] form.") + simd_view> operator()(int i) { + return select<1, 1>(i); + } + + // TODO ESIMD_EXPERIMENTAL + /// Read multiple elements by their indices in vector + template + resize_a_simd_type_t + iselect(const simd &Indices) { + vector_type_t Offsets = Indices.data() * sizeof(Ty); + return __esimd_rdindirect(data(), Offsets); + } + // TODO ESIMD_EXPERIMENTAL + /// update single element + void iupdate(ushort Index, Ty V) { + auto Val = data(); + Val[Index] = V; + set(Val); + } + + // TODO ESIMD_EXPERIMENTAL + /// update multiple elements by their indices in vector + template + void iupdate(const simd &Indices, + const resize_a_simd_type_t &Val, + const simd_mask_type &Mask) { + vector_type_t Offsets = Indices.data() * sizeof(Ty); + set(__esimd_wrindirect(data(), Val.data(), Offsets, + Mask.data())); + } + + /// \name Replicate + /// Replicate simd_obj_impl instance given a region. + /// @{ + /// + + /// \tparam Rep is number of times region has to be replicated. + /// \return replicated simd_obj_impl instance. + template resize_a_simd_type_t replicate() { + return replicate(0); + } + + /// \tparam Rep is number of times region has to be replicated. + /// \tparam W is width of src region to replicate. + /// \param Offset is offset in number of elements in src region. + /// \return replicated simd_obj_impl instance. + template + __SYCL_DEPRECATED("use simd_obj_impl::replicate_w") + resize_a_simd_type_t replicate(uint16_t Offset) { + return replicate_w(Offset); + } + + /// \tparam Rep is number of times region has to be replicated. + /// \tparam W is width of src region to replicate. + /// \param Offset is offset in number of elements in src region. + /// \return replicated simd_obj_impl instance. + template + resize_a_simd_type_t replicate_w(uint16_t Offset) { + return replicate_vs_w_hs(Offset); + } + + /// \tparam Rep is number of times region has to be replicated. + /// \tparam VS vertical stride of src region to replicate. + /// \tparam W is width of src region to replicate. + /// \param Offset is offset in number of elements in src region. + /// \return replicated simd_obj_impl instance. + template + __SYCL_DEPRECATED("use simd_obj_impl::replicate_vs_w") + resize_a_simd_type_t replicate(uint16_t Offset) { + return replicate_vs_w(Offset); + } + + /// \tparam Rep is number of times region has to be replicated. + /// \tparam VS vertical stride of src region to replicate. + /// \tparam W width of src region to replicate. + /// \param Offset offset in number of elements in src region. + /// \return replicated simd_obj_impl instance. + template + resize_a_simd_type_t replicate_vs_w(uint16_t Offset) { + return replicate_vs_w_hs(Offset); + } + + /// \tparam Rep is number of times region has to be replicated. + /// \tparam VS vertical stride of src region to replicate. + /// \tparam W is width of src region to replicate. + /// \tparam HS horizontal stride of src region to replicate. + /// \param Offset is offset in number of elements in src region. + /// \return replicated simd_obj_impl instance. + template + __SYCL_DEPRECATED("use simd_obj_impl::replicate_vs_w_hs") + resize_a_simd_type_t replicate(uint16_t Offset) { + return replicate_vs_w_hs(Offset); + } + + /// \tparam Rep is number of times region has to be replicated. + /// \tparam VS vertical stride of src region to replicate. + /// \tparam W is width of src region to replicate. + /// \tparam HS horizontal stride of src region to replicate. + /// \param Offset is offset in number of elements in src region. 
+ /// \return replicated simd_obj_impl instance. + template + resize_a_simd_type_t replicate_vs_w_hs(uint16_t Offset) { + return __esimd_rdregion(data(), + Offset * sizeof(Ty)); + } + ///@} + + /// Any operation. + /// + /// \return 1 if any element is set, 0 otherwise. + template ::value>> + uint16_t any() { + return __esimd_any(data()); + } + + /// All operation. + /// + /// \return 1 if all elements are set, 0 otherwise. + template ::value>> + uint16_t all() { + return __esimd_all(data()); + } + + /// Write a simd_obj_impl-vector into a basic region of a simd_obj_impl + /// object. + template + ESIMD_INLINE void writeRegion( + RTy Region, + const vector_type_t &Val) { + using ElemTy = typename RTy::element_type; + if constexpr (N * sizeof(Ty) == RTy::length * sizeof(ElemTy)) + // update the entire vector + set(bitcast(Val)); + else { + static_assert(!RTy::Is_2D); + // If element type differs, do bitcast conversion first. + auto Base = bitcast(data()); + constexpr int BN = (N * sizeof(Ty)) / sizeof(ElemTy); + // Access the region information. + constexpr int M = RTy::Size_x; + constexpr int Stride = RTy::Stride_x; + uint16_t Offset = Region.M_offset_x * sizeof(ElemTy); + + // Merge and update. + auto Merged = __esimd_wrregion(Base, Val, Offset); + // Convert back to the original element type, if needed. + set(bitcast(Merged)); + } + } + + /// Write a simd_obj_impl-vector into a nested region of a simd_obj_impl + /// object. + template + ESIMD_INLINE void + writeRegion(std::pair Region, + const vector_type_t &Val) { + // parent-region type + using PaTy = typename shape_type::type; + using ElemTy = typename TR::element_type; + using BT = typename PaTy::element_type; + constexpr int BN = PaTy::length; + + if constexpr (PaTy::Size_in_bytes == TR::Size_in_bytes) { + writeRegion(Region.second, bitcast(Val)); + } else { + // Recursively read the base + auto Base = readRegion(data(), Region.second); + // If element type differs, do bitcast conversion first. + auto Base1 = bitcast(Base); + constexpr int BN1 = PaTy::Size_in_bytes / sizeof(ElemTy); + + if constexpr (!TR::Is_2D) { + // Access the region information. + constexpr int M = TR::Size_x; + constexpr int Stride = TR::Stride_x; + uint16_t Offset = Region.first.M_offset_x * sizeof(ElemTy); + + // Merge and update. + Base1 = __esimd_wrregion(Base1, Val, Offset); + } else { + static_assert(std::is_same::value); + // Read columns with non-trivial horizontal stride. + constexpr int M = TR::length; + constexpr int VS = PaTy::Size_x * TR::Stride_y; + constexpr int W = TR::Size_x; + constexpr int HS = TR::Stride_x; + constexpr int ParentWidth = PaTy::Size_x; + + // Compute the byte offset for the starting element. + uint16_t Offset = static_cast( + (Region.first.M_offset_y * PaTy::Size_x + Region.first.M_offset_x) * + sizeof(ElemTy)); + + // Merge and update. + Base1 = __esimd_wrregion( + Base1, Val, Offset); + } + // Convert back to the original element type, if needed. + auto Merged1 = bitcast(Base1); + // recursively write it back to the base + writeRegion(Region.second, Merged1); + } + } + + /// @name Memory operations + /// TODO NOTE: These APIs do not support cache hint specification yet, as this + /// is WIP. Later addition of hints is not expected to break code using these + /// APIs. + /// + /// @{ + + /// Copy a contiguous block of data from memory into this simd_obj_impl + /// object. The amount of memory copied equals the total size of vector + /// elements in this object. + /// @param addr the memory address to copy from. 
Must be a pointer to the + /// global address space, otherwise behavior is undefined. + ESIMD_INLINE void copy_from(const Ty *const addr) SYCL_ESIMD_FUNCTION; + + /// Copy a contiguous block of data from memory into this simd_obj_impl + /// object. The amount of memory copied equals the total size of vector + /// elements in this object. Source memory location is represented via a + /// global accessor and offset. + /// @param acc accessor to copy from. + /// @param offset offset to copy from. + template + ESIMD_INLINE EnableIfAccessor + copy_from(AccessorT acc, uint32_t offset) SYCL_ESIMD_FUNCTION; + + /// Copy all vector elements of this object into a contiguous block in memory. + /// @param addr the memory address to copy to. Must be a pointer to the + /// global address space, otherwise behavior is undefined. + ESIMD_INLINE void copy_to(Ty *addr) SYCL_ESIMD_FUNCTION; + + /// Copy all vector elements of this object into a contiguous block in memory. + /// Destination memory location is represented via a global accessor and + /// offset. + /// @param acc accessor to copy from. + /// @param offset offset to copy from. + template + ESIMD_INLINE EnableIfAccessor + copy_to(AccessorT acc, uint32_t offset) SYCL_ESIMD_FUNCTION; + + /// @} // Memory operations + + /// Bitwise inversion, available in all subclasses. + template >> + Derived operator~() { + return Derived(~data()); + } + + /// Unary logical negation operator, available in all subclasses. + template >> + simd_mask_type operator!() { + using MaskVecT = typename simd_mask_type::vector_type; + auto R = data() == vector_type(0); + return simd_mask_type{__builtin_convertvector(R, MaskVecT) & + MaskVecT(1)}; + } + +#define __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(BINOP, OPASSIGN, COND) \ + \ + /* OPASSIGN simd_obj_impl */ \ + template == \ + is_simd_type_v)&&COND>> \ + Derived &operator OPASSIGN( \ + const __SEIEED::simd_obj_impl &RHS) { \ + auto Res = *this BINOP RHS; \ + set(__SEIEED::convert(Res.data())); \ + return cast_this_to_derived(); \ + } \ + \ + /* OPASSIGN simd_view */ \ + template == \ + is_simd_type_v)&&(RegionT1::length == length) && \ + COND>> \ + Derived &operator OPASSIGN( \ + const __SEIEE::simd_view &RHS) { \ + auto Res = *this BINOP RHS.read(); \ + set(__SEIEED::convert(Res.data())); \ + return cast_this_to_derived(); \ + } \ + \ + /* OPASSIGN SCALAR */ \ + template > \ + Derived &operator OPASSIGN(T1 RHS) { \ + if constexpr (is_simd_type_v) { \ + using RHSVecT = __SEIEED::construct_a_simd_type_t; \ + return *this OPASSIGN RHSVecT(RHS); \ + } else { \ + return *this OPASSIGN Derived((Ty)RHS); \ + } \ + } + +// Bitwise operations are defined for simd objects and masks, and both operands +// must be integral +#define __ESIMD_BITWISE_OP_FILTER \ + std::is_integral_v &&std::is_integral_v + + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(^, ^=, __ESIMD_BITWISE_OP_FILTER) + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(|, |=, __ESIMD_BITWISE_OP_FILTER) + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(&, &=, __ESIMD_BITWISE_OP_FILTER) + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(%, %=, __ESIMD_BITWISE_OP_FILTER) +#undef __ESIMD_BITWISE_OP_FILTER + +// Bit shift operations are defined only for simd objects (not for masks), and +// both operands must be integral +#define __ESIMD_SHIFT_OP_FILTER \ + std::is_integral_v &&std::is_integral_v \ + &&__SEIEED::is_simd_type_v + + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(<<, <<=, __ESIMD_SHIFT_OP_FILTER) + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(>>, >>=, __ESIMD_SHIFT_OP_FILTER) +#undef __ESIMD_SHIFT_OP_FILTER + +// Arithmetic operations are 
defined only for simd objects, and the second +// operand's element type must be vectorizable. This requirement for 'this' +// is fulfilled, because otherwise 'this' couldn't have been constructed. +#define __ESIMD_ARITH_OP_FILTER \ + __SEIEED::is_simd_type_v &&__SEIEED::is_vectorizable_v + + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(+, +=, __ESIMD_ARITH_OP_FILTER) + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(-, -=, __ESIMD_ARITH_OP_FILTER) + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(*, *=, __ESIMD_ARITH_OP_FILTER) + __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(/, /=, __ESIMD_ARITH_OP_FILTER) +#undef __ESIMD_ARITH_OP_FILTER + +private: + // The underlying data for this vector. + vector_type M_data; + +protected: + void set(const vector_type &Val) { +#ifndef __SYCL_DEVICE_ONLY__ + M_data = Val; +#else + __esimd_vstore(&M_data, Val); +#endif + } +}; + +// ----------- Outlined implementations of simd_obj_impl class APIs. + +template +void simd_obj_impl::copy_from(const T *const Addr) { + constexpr unsigned Sz = sizeof(T) * N; + static_assert(Sz >= OperandSize::OWORD, + "block size must be at least 1 oword"); + static_assert(Sz % OperandSize::OWORD == 0, + "block size must be whole number of owords"); + static_assert(isPowerOf2(Sz / OperandSize::OWORD), + "block must be 1, 2, 4 or 8 owords long"); + static_assert(Sz <= 8 * OperandSize::OWORD, + "block size must be at most 8 owords"); + + uintptr_t AddrVal = reinterpret_cast(Addr); + *this = + __esimd_flat_block_read_unaligned( + AddrVal); +} + +template +template +ESIMD_INLINE EnableIfAccessor +simd_obj_impl::copy_from(AccessorT acc, uint32_t offset) { + constexpr unsigned Sz = sizeof(T) * N; + static_assert(Sz >= OperandSize::OWORD, + "block size must be at least 1 oword"); + static_assert(Sz % OperandSize::OWORD == 0, + "block size must be whole number of owords"); + static_assert(isPowerOf2(Sz / OperandSize::OWORD), + "block must be 1, 2, 4 or 8 owords long"); + static_assert(Sz <= 8 * OperandSize::OWORD, + "block size must be at most 8 owords"); +#if defined(__SYCL_DEVICE_ONLY__) + auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc); + *this = __esimd_block_read(surf_ind, offset); +#else + *this = __esimd_block_read(acc, offset); +#endif // __SYCL_DEVICE_ONLY__ +} + +template +void simd_obj_impl::copy_to(T *addr) { + constexpr unsigned Sz = sizeof(T) * N; + static_assert(Sz >= OperandSize::OWORD, + "block size must be at least 1 oword"); + static_assert(Sz % OperandSize::OWORD == 0, + "block size must be whole number of owords"); + static_assert(isPowerOf2(Sz / OperandSize::OWORD), + "block must be 1, 2, 4 or 8 owords long"); + static_assert(Sz <= 8 * OperandSize::OWORD, + "block size must be at most 8 owords"); + + uintptr_t AddrVal = reinterpret_cast(addr); + __esimd_flat_block_write(AddrVal, + data()); +} + +template +template +ESIMD_INLINE EnableIfAccessor +simd_obj_impl::copy_to(AccessorT acc, uint32_t offset) { + constexpr unsigned Sz = sizeof(T) * N; + static_assert(Sz >= OperandSize::OWORD, + "block size must be at least 1 oword"); + static_assert(Sz % OperandSize::OWORD == 0, + "block size must be whole number of owords"); + static_assert(isPowerOf2(Sz / OperandSize::OWORD), + "block must be 1, 2, 4 or 8 owords long"); + static_assert(Sz <= 8 * OperandSize::OWORD, + "block size must be at most 8 owords"); + +#if defined(__SYCL_DEVICE_ONLY__) + auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc); + __esimd_block_write(surf_ind, offset >> 4, data()); +#else + __esimd_block_write(acc, offset >> 4, data()); +#endif // __SYCL_DEVICE_ONLY__ +} +} // 
namespace detail + +} // namespace esimd +} // namespace experimental +} // namespace intel +} // namespace ext +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp index 56c4e626118b8..72f9b9381ad70 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp @@ -25,23 +25,22 @@ namespace detail { /// It is an internal class implementing basic functionality of simd_view. /// /// \ingroup sycl_esimd -template -class simd_view_impl { +template class simd_view_impl { + using Derived = simd_view; + template friend class simd_obj_impl; template friend class simd; - template friend class simd_view_impl; + template friend class simd_view_impl; + template friend class simd_mask_impl; public: - static_assert(!detail::is_simd_view_v::value); + static_assert(is_simd_obj_impl_derivative_v); // Deduce the corresponding value type from its region type. using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; - /// The simd type if reading the object. - using value_type = simd; - - /// The underlying builtin value type - using vector_type = - detail::vector_type_t; + using base_type = BaseTy; + template + using get_simd_t = construct_a_simd_type_t; /// The region type of this class. using region_type = RegionTy; @@ -50,43 +49,42 @@ class simd_view_impl { /// type of the base object type. using element_type = typename ShapeTy::element_type; - /// @{ - /// Constructors. + /// The simd type if reading the object. + using value_type = get_simd_t; + + /// The underlying builtin vector type backing the value read from the object. + using vector_type = vector_type_t; private: Derived &cast_this_to_derived() { return reinterpret_cast(*this); } protected: + /// @{ + /// Constructors. simd_view_impl(BaseTy &Base, RegionTy Region) : M_base(Base), M_region(Region) {} simd_view_impl(BaseTy &&Base, RegionTy Region) : M_base(Base), M_region(Region) {} - + /// @} public: // Default copy and move constructors. simd_view_impl(const simd_view_impl &Other) = default; simd_view_impl(simd_view_impl &&Other) = default; - /// @} - /// Conversion to simd type. - template operator simd() const { - if constexpr (std::is_same::value) + /// Implicit conversion to simd type. + template >> + inline operator simd() const { + if constexpr (std::is_same_v) return read(); else return convert(read()); } - /// @{ - /// Assignment operators. - simd_view_impl &operator=(const simd_view_impl &Other) { - return write(Other.read()); - } - simd_view_impl &operator=(const value_type &Val) { return write(Val); } - /// @} - - /// Move assignment operator. - simd_view_impl &operator=(simd_view_impl &&Other) { - return write(Other.read()); + /// Implicit conversion to simd_mask_impl type, if element type is compatible. 
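With value_type now spelled via get_simd_t and the conversion operator made implicit only for convertible element types, a view reads back as the matching user-visible simd type and can be converted or updated in place. A minimal usage sketch, assuming the simd.hpp header added in this patch and an ESIMD device-code context; the base-plus-step constructor used below is the pre-existing simd API being carried over.

// Usage sketch only -- assumes this patch's headers and ESIMD device code.
#include <sycl/ext/intel/experimental/esimd/simd.hpp>

namespace esimd = sycl::ext::intel::experimental::esimd;

inline void view_conversion_sketch() {
  esimd::simd<float, 16> Va(0.f, 1.f);   // 0, 1, 2, ..., 15
  auto Evens = Va.select<8, 2>(0);       // simd_view over every other element
  esimd::simd<float, 8> Copy = Evens;    // implicit conversion -> read()
  Evens += 1.f;                          // compound assignment writes back into Va
  (void)Copy;
}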
+ template >> + inline operator simd_mask_type() const { + return read(); } /// @{ @@ -97,9 +95,11 @@ class simd_view_impl { static constexpr int getStrideX() { return ShapeTy::Stride_x; } static constexpr int getSizeY() { return ShapeTy::Size_y; } static constexpr int getStrideY() { return ShapeTy::Stride_y; } + constexpr uint16_t getOffsetX() const { return getTopRegion(M_region).M_offset_x; } + constexpr uint16_t getOffsetY() const { return getTopRegion(M_region).M_offset_y; } @@ -109,9 +109,11 @@ class simd_view_impl { value_type read() const { using BT = typename BaseTy::element_type; constexpr int BN = BaseTy::length; - return detail::readRegion(M_base.data(), M_region); + return value_type{readRegion(M_base.data(), M_region)}; } + typename value_type::vector_type data() const { return read().data(); } + /// Write to this object. Derived &write(const value_type &Val) { M_base.writeRegion(M_region, Val.data()); @@ -120,12 +122,12 @@ class simd_view_impl { /// @{ /// Whole region update with predicates. - void merge(const value_type &Val, const mask_type_t &Mask) { + void merge(const value_type &Val, const simd_mask_type &Mask) { merge(Val, read(), Mask); } void merge(const value_type &Val1, value_type Val2, - const mask_type_t &Mask) { + const simd_mask_type &Mask) { Val2.merge(Val1, Mask); write(Val2.read()); } @@ -200,84 +202,139 @@ class simd_view_impl { TopRegionTy TopReg(OffsetY, OffsetX); return RetTy{this->M_base, std::make_pair(TopReg, M_region)}; } - -#define DEF_BINOP(BINOP, OPASSIGN) \ - ESIMD_INLINE friend auto operator BINOP(const Derived &X, \ - const Derived &Y) { \ - return (X BINOP Y.read()); \ +#define __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(BINOP, OPASSIGN, COND) \ + \ + /* OPASSIGN simd_obj_impl */ \ + template == \ + is_simd_type_v)&&(N1 == length) && \ + COND>> \ + Derived &operator OPASSIGN(const simd_obj_impl &RHS) { \ + auto Res = read() BINOP RHS; \ + write(Res); \ + return cast_this_to_derived(); \ } \ - Derived &operator OPASSIGN(const value_type &RHS) { \ - using ComputeTy = detail::compute_type_t; \ - auto V0 = detail::convert(read().data()); \ - auto V1 = detail::convert(RHS.data()); \ - auto V2 = V0 BINOP V1; \ - auto V3 = detail::convert(V2); \ - write(V3); \ + \ + /* OPASSIGN simd_view_impl */ \ + template ::element_type, \ + class T = element_type, class SimdT = BaseTy, \ + class = std::enable_if_t< \ + (is_simd_type_v == is_simd_type_v)&&( \ + length == __SEIEE::shape_type::length) && \ + COND>> \ + Derived &operator OPASSIGN(const simd_view_impl &RHS) { \ + *this OPASSIGN RHS.read(); \ return cast_this_to_derived(); \ } \ - Derived &operator OPASSIGN(const Derived &RHS) { \ - return (*this OPASSIGN RHS.read()); \ + \ + /* OPASSIGN scalar */ \ + template > \ + Derived &operator OPASSIGN(T1 RHS) { \ + auto Res = read() BINOP RHS; \ + write(Res); \ + return cast_this_to_derived(); \ } - DEF_BINOP(+, +=) - DEF_BINOP(-, -=) - DEF_BINOP(*, *=) - DEF_BINOP(/, /=) - DEF_BINOP(%, %=) +#define __ESIMD_BITWISE_OP_FILTER std::is_integral_v &&std::is_integral_v + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(^, ^=, __ESIMD_BITWISE_OP_FILTER) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(|, |=, __ESIMD_BITWISE_OP_FILTER) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(&, &=, __ESIMD_BITWISE_OP_FILTER) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(%, %=, __ESIMD_BITWISE_OP_FILTER) +#undef __ESIMD_BITWISE_OP_FILTER -#undef DEF_BINOP +#define __ESIMD_SHIFT_OP_FILTER \ + std::is_integral_v &&std::is_integral_v &&is_simd_type_v -#define DEF_BITWISE_OP(BITWISE_OP, OPASSIGN) \ - ESIMD_INLINE friend auto 
operator BITWISE_OP(const Derived &X, \ - const Derived &Y) { \ - return (X BITWISE_OP Y.read()); \ - } \ - Derived &operator OPASSIGN(const value_type &RHS) { \ - static_assert(std::is_integral(), "not integeral type"); \ - auto V2 = read().data() BITWISE_OP RHS.data(); \ - auto V3 = detail::convert(V2); \ - write(V3); \ - return cast_this_to_derived(); \ - } \ - Derived &operator OPASSIGN(const Derived &RHS) { \ - return (*this OPASSIGN RHS.read()); \ - } - DEF_BITWISE_OP(&, &=) - DEF_BITWISE_OP(|, |=) - DEF_BITWISE_OP(^, ^=) - DEF_BITWISE_OP(>>, >>=) - DEF_BITWISE_OP(<<, <<=) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(<<, <<=, __ESIMD_SHIFT_OP_FILTER) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(>>, >>=, __ESIMD_SHIFT_OP_FILTER) +#undef __ESIMD_SHIFT_OP_FILTER + +#define __ESIMD_ARITH_OP_FILTER \ + is_vectorizable_v &&is_vectorizable_v &&is_simd_type_v -#undef DEF_BITWISE_OP + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(+, +=, __ESIMD_ARITH_OP_FILTER) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(-, -=, __ESIMD_ARITH_OP_FILTER) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(*, *=, __ESIMD_ARITH_OP_FILTER) + __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(/, /=, __ESIMD_ARITH_OP_FILTER) -#define DEF_UNARY_OP(UNARY_OP) \ +#undef __ESIMD_ARITH_OP_FILTER +#undef __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN + +#define __ESIMD_DEF_UNARY_OP(UNARY_OP, COND) \ + template > \ auto operator UNARY_OP() { \ auto V = UNARY_OP(read().data()); \ - return simd(V); \ + return get_simd_t(V); \ + } + __ESIMD_DEF_UNARY_OP(~, std::is_integral_v &&is_simd_type_v) + __ESIMD_DEF_UNARY_OP(+, is_simd_type_v) + __ESIMD_DEF_UNARY_OP(-, is_simd_type_v) + +#undef __ESIMD_DEF_UNARY_OP + + /// Unary logical negeation operator. Applies only to integer element types. + template >> + auto operator!() { + using MaskVecT = typename simd_mask_type::vector_type; + auto V = read().data() == 0; + return simd_mask_type{__builtin_convertvector(V, MaskVecT) & + MaskVecT(1)}; + } + + /// @{ + /// Assignment operators. + simd_view_impl &operator=(const simd_view_impl &Other) { + return write(Other.read()); } - DEF_UNARY_OP(~) - DEF_UNARY_OP(+) - DEF_UNARY_OP(-) -#undef DEF_UNARY_OP + Derived &operator=(const Derived &Other) { return write(Other.read()); } - // negation operator - auto operator!() { return cast_this_to_derived() == 0; } + Derived &operator=(const value_type &Val) { return write(Val); } + + /// Move assignment operator. + Derived &operator=(Derived &&Other) { return write(Other.read()); } + simd_view_impl &operator=(simd_view_impl &&Other) { + return write(Other.read()); + } + + template == + is_simd_type_v)&&(length == + SimdT::length)>> + Derived &operator=(const simd_obj_impl &Other) { + return write(convert(reinterpret_cast(Other))); + } + + template >> + Derived &operator=(T1 RHS) { + return write(value_type((element_type)RHS)); + } + + /// @} // Operator ++, -- Derived &operator++() { *this += 1; return cast_this_to_derived(); } + value_type operator++(int) { value_type Ret(read()); operator++(); return Ret; } + Derived &operator--() { *this -= 1; return cast_this_to_derived(); } + value_type operator--(int) { value_type Ret(read()); operator--(); @@ -289,7 +346,7 @@ class simd_view_impl { template > auto row(int i) { - return select<1, 0, getSizeX(), 1>(i, 0) + return select<1, 1, getSizeX(), 1>(i, 0) .template bit_cast_view(); } @@ -298,7 +355,7 @@ class simd_view_impl { template > auto column(int i) { - return select(0, i); + return select(0, i); } /// Read a single element from a 1D region, by value only. 
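The row/column/operator[] hunks around here switch single-element and single-row accesses from select<1, 0> to select<1, 1>. The following standalone sketch models what a (Size, Stride, Offset) region selects, to make the index arithmetic behind those template arguments concrete; it is a software illustration, not the real rdregion intrinsic.

#include <array>
#include <cstdio>

// Software model of a 1D region read: Size elements starting at Offset,
// Stride elements apart -- what select<Size, Stride>(Offset) describes.
template <int Size, int Stride, typename T, std::size_t N>
std::array<T, Size> read_region(const std::array<T, N> &Base, int Offset) {
  std::array<T, Size> Out{};
  for (int i = 0; i < Size; ++i)
    Out[i] = Base[Offset + i * Stride];
  return Out;
}

int main() {
  std::array<int, 8> V{10, 11, 12, 13, 14, 15, 16, 17};
  auto Pair = read_region<2, 3>(V, 1); // elements 1 and 4 -> 11, 14
  auto One  = read_region<1, 1>(V, 5); // single element 5 -> 15
  std::printf("%d %d | %d\n", Pair[0], Pair[1], One[0]);
}

For a one-element (or one-row) region the stride never advances, so 0 and 1 pick the same data; stride 1 is presumably just the canonical form the reworked region machinery expects.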
@@ -322,7 +379,7 @@ class simd_view_impl { template > auto operator[](int i) { - return select<1, 0>(i); + return select<1, 1>(i); } /// Return a writeable view of a single element. @@ -330,16 +387,16 @@ class simd_view_impl { typename = sycl::detail::enable_if_t> __SYCL_DEPRECATED("use operator[] form.") auto operator()(int i) { - return select<1, 0>(i); + return select<1, 1>(i); } /// \name Replicate - /// Replicate simd instance given a simd_view + /// Replicate simd instance given a simd_view_impl /// @{ /// /// \tparam Rep is number of times region has to be replicated. - template simd replicate() { + template get_simd_t replicate() { return read().replicate(0); } @@ -348,7 +405,7 @@ class simd_view_impl { /// \param OffsetX is column offset in number of elements in src region. /// \return replicated simd instance. template - simd replicate(uint16_t OffsetX) { + get_simd_t replicate(uint16_t OffsetX) { return replicate(0, OffsetX); } @@ -358,7 +415,8 @@ class simd_view_impl { /// \param OffsetY is row offset in number of elements in src region. /// \return replicated simd instance. template - simd replicate(uint16_t OffsetY, uint16_t OffsetX) { + get_simd_t replicate(uint16_t OffsetY, + uint16_t OffsetX) { return replicate(OffsetY, OffsetX); } @@ -368,7 +426,7 @@ class simd_view_impl { /// \param OffsetX is column offset in number of elements in src region. /// \return replicated simd instance. template - simd replicate(uint16_t OffsetX) { + get_simd_t replicate(uint16_t OffsetX) { return replicate(0, OffsetX); } @@ -379,7 +437,8 @@ class simd_view_impl { /// \param OffsetY is row offset in number of elements in src region. /// \return replicated simd instance. template - simd replicate(uint16_t OffsetY, uint16_t OffsetX) { + get_simd_t replicate(uint16_t OffsetY, + uint16_t OffsetX) { return replicate(OffsetY, OffsetX); } @@ -390,7 +449,7 @@ class simd_view_impl { /// \param OffsetX is column offset in number of elements in src region. /// \return replicated simd instance. template - simd replicate(uint16_t OffsetX) { + get_simd_t replicate(uint16_t OffsetX) { return read().template replicate(OffsetX); } @@ -402,29 +461,28 @@ class simd_view_impl { /// \param OffsetY is row offset in number of elements in src region. /// \return replicated simd instance. template - simd replicate(uint16_t OffsetY, uint16_t OffsetX) { + get_simd_t replicate(uint16_t OffsetY, + uint16_t OffsetX) { constexpr int RowSize = is2D() ? getSizeX() : 0; return read().template replicate(OffsetY * RowSize + OffsetX); } /// @} - /// Any operation. + /// 'any' operation. /// /// \return 1 if any element is set, 0 otherwise. - template < - typename T1 = element_type, typename T2 = BaseTy, - typename = sycl::detail::enable_if_t::value, T2>> + template ::value, T2>> uint16_t any() { return read().any(); } - /// All operation. + /// 'all' operation. /// /// \return 1 if all elements are set, 0 otherwise. 
- template < - typename T1 = element_type, typename T2 = BaseTy, - typename = sycl::detail::enable_if_t::value, T2>> + template ::value, T2>> uint16_t all() { return read().all(); } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp index e14d2e9c5306e..2746886c8253b 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp @@ -16,6 +16,12 @@ #include #include +#if defined(__ESIMD_DBG_HOST) && !defined(__SYCL_DEVICE_ONLY__) +#define __esimd_dbg_print(a) std::cout << ">>> " << #a << "\n" +#else +#define __esimd_dbg_print(a) +#endif // defined(__ESIMD_DBG_HOST) && !defined(__SYCL_DEVICE_ONLY__) + #include __SYCL_INLINE_NAMESPACE(cl) { @@ -25,14 +31,139 @@ namespace intel { namespace experimental { namespace esimd { -// simd and simd_view forward declarations +// simd and simd_view_impl forward declarations template class simd; template class simd_view; namespace detail { -namespace csd = cl::sycl::detail; +// forward declarations of major internal simd classes +template class simd_mask_impl; +template +class simd_obj_impl; + +// @{ +// Helpers for major simd classes, which don't require their definitions to +// compile. Error checking/SFINAE is not used as these are only used internally. + +using simd_mask_elem_type = unsigned short; +template using simd_mask_type = simd_mask_impl; + +// @{ +// Checks if given type T is a raw clang vector type, plus provides some info +// about it if it is. + +struct invalid_element_type; + +template struct is_clang_vector_type : std::false_type { + static inline constexpr int length = 0; + using element_type = invalid_element_type; +}; + +template +struct is_clang_vector_type + : std::true_type { + static inline constexpr int length = N; + using element_type = T; +}; +template +static inline constexpr bool is_clang_vector_type_v = + is_clang_vector_type::value; + +// @} + +// @{ +// Checks if given type T derives from simd_obj_impl or is equal to it. +template +struct is_simd_obj_impl_derivative : public std::false_type { + using element_type = invalid_element_type; +}; + +// Specialization for the simd_obj_impl type itself. +template +struct is_simd_obj_impl_derivative> + : public std::true_type { + using element_type = ElT; +}; + +// Specialization for all other types. +template class Derived> +struct is_simd_obj_impl_derivative> + : public std::conditional_t< + std::is_base_of_v>, + Derived>, + std::true_type, std::false_type> { + using element_type = std::conditional_t< + std::is_base_of_v>, + Derived>, + ElT, void>; +}; + +// Convenience shortcut. +template +inline constexpr bool is_simd_obj_impl_derivative_v = + is_simd_obj_impl_derivative::value; +// @} + +// @{ +// "Resizes" given simd type \c T to given number of elements \c N. +template struct resize_a_simd_type; + +// Specialization for the simd_obj_impl type. +template class SimdT> +struct resize_a_simd_type>, Ndst> { + using type = simd_obj_impl>; +}; + +// Specialization for the simd_obj_impl type derivatives. +template class SimdT> +struct resize_a_simd_type, Ndst> { + using type = SimdT; +}; + +// Convenience shortcut. +template +using resize_a_simd_type_t = typename resize_a_simd_type::type; +// @} + +// @{ +// Converts element type of given simd type \c SimdT to +// given scalar type \c DstElemT. +template struct convert_simd_elem_type; + +// Specialization for the simd_obj_impl type. 
+template class SimdT> +struct convert_simd_elem_type>, + DstElemT> { + using type = simd_obj_impl>; +}; + +// Specialization for the simd_obj_impl type derivatives. +template class SimdT> +struct convert_simd_elem_type, DstElemT> { + using type = SimdT; +}; + +// Convenience shortcut. +template +using convert_simd_elem_type_t = + typename convert_simd_elem_type::type; + +// @} + +// Constructs a simd type with the same template type as in \c SimdT, and +// given element type and number. +template +using construct_a_simd_type_t = + convert_simd_elem_type_t, ElT>; +// @} + +namespace csd = cl::sycl::detail; using half = cl::sycl::detail::half_impl::StorageT; template @@ -52,22 +183,25 @@ struct is_esimd_arithmetic_type< decltype(std::declval() - std::declval()), decltype(std::declval() * std::declval()), decltype(std::declval() / std::declval())>> - : std::true_type {}; + : std::conditional_t, std::true_type, + std::false_type> {}; -// is_vectorizable_type template -struct is_vectorizable : public is_esimd_arithmetic_type {}; +static inline constexpr bool is_esimd_arithmetic_type_v = + is_esimd_arithmetic_type::value; -template <> struct is_vectorizable : public std::false_type {}; +// is_vectorizable_type +template +struct is_vectorizable : std::conditional_t, + std::true_type, std::false_type> {}; template -struct is_vectorizable_v - : std::integral_constant::value> {}; +static inline constexpr bool is_vectorizable_v = is_vectorizable::value; // vector_type, using clang vector type extension. template struct vector_type { static_assert(!std::is_const::value, "const element type not supported"); - static_assert(is_vectorizable_v::value, "element type not supported"); + static_assert(is_vectorizable_v, "element type not supported"); static_assert(N > 0, "zero-element vector not supported"); static constexpr int length = N; @@ -77,16 +211,28 @@ template struct vector_type { template using vector_type_t = typename vector_type::type; +// must match simd_mask::element_type +template +using simd_mask_storage_t = vector_type_t; + // Compute the simd_view type of a 1D format operation. 
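simd_mask_storage_t pins the mask's raw storage to unsigned short lanes (simd_mask_elem_type above), and vector_type wraps the clang vector extension. The standalone sketch below shows the underlying convention the mask wrapper normalizes: vector comparisons produce 0/-1 per lane, which is then converted to 0/1 in 16-bit lanes. The attribute spelling here is illustrative; the real headers build vector_type their own way. Requires a compiler with __builtin_convertvector (clang, or GCC 9+).

#include <cstdio>

// Illustrative stand-ins for vector_type_t<float, 8> and simd_mask_storage_t<8>.
typedef float float8 __attribute__((vector_size(8 * sizeof(float))));
typedef int cmp8 __attribute__((vector_size(8 * sizeof(int))));
typedef unsigned short mask8 __attribute__((vector_size(8 * sizeof(unsigned short))));

int main() {
  float8 A = {0, 1, 2, 3, 4, 5, 6, 7};
  float8 B = {4, 4, 4, 4, 4, 4, 4, 4};
  cmp8 Cmp = A < B;                                     // per-lane 0 / -1
  mask8 One = {1, 1, 1, 1, 1, 1, 1, 1};
  mask8 M = __builtin_convertvector(Cmp, mask8) & One;  // normalize to 0 / 1
  for (int i = 0; i < 8; ++i)
    std::printf("%u ", (unsigned)M[i]);                 // 1 1 1 1 0 0 0 0
  std::printf("\n");
}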
template struct compute_format_type; -template -struct compute_format_type, EltTy> { +template struct compute_format_type_impl { static constexpr int Size = sizeof(Ty) * N / sizeof(EltTy); static constexpr int Stride = 1; using type = region1d_t; }; +template class SimdT> +struct compute_format_type, EltTy> + : compute_format_type_impl {}; + +template +struct compute_format_type, EltTy> + : compute_format_type_impl {}; + template struct compute_format_type, EltTy> { using ShapeTy = typename shape_type::type; @@ -103,7 +249,7 @@ template struct compute_format_type_2d; template -struct compute_format_type_2d, EltTy, Height, Width> { +struct compute_format_type_2d_impl { static constexpr int Prod = sizeof(Ty) * N / sizeof(EltTy); static_assert(Prod == Width * Height, "size mismatch"); @@ -114,6 +260,16 @@ struct compute_format_type_2d, EltTy, Height, Width> { using type = region2d_t; }; +template class SimdT> +struct compute_format_type_2d, EltTy, Height, Width> + : compute_format_type_2d_impl {}; + +template +struct compute_format_type_2d, EltTy, Height, Width> + : compute_format_type_2d_impl {}; + template struct compute_format_type_2d, EltTy, Height, @@ -133,49 +289,120 @@ template using compute_format_type_2d_t = typename compute_format_type_2d::type; -// Check if a type is simd_view type -template struct is_simd_view_type : std::false_type {}; +// @{ +// Checks if given type is a view of any simd type (simd or simd_mask). +template struct is_any_simd_view_type : std::false_type {}; template -struct is_simd_view_type> : std::true_type {}; +struct is_any_simd_view_type> : std::true_type {}; + +template +static inline constexpr bool is_any_simd_view_type_v = + is_any_simd_view_type::value; +// @} + +// @{ +// Check if a type is one of internal 'simd_xxx_impl' types exposing simd-like +// interfaces and behaving like a simd object type. template -struct is_simd_view_v - : std::integral_constant>::value> {}; +static inline constexpr bool is_simd_like_type_v = + is_any_simd_view_type_v || is_simd_obj_impl_derivative_v; +// @} -// Check if a type is simd or simd_view type +// @{ +// Checks if given type is a any of the user-visible simd types (simd or +// simd_mask). template struct is_simd_type : std::false_type {}; +template +struct is_simd_type> : std::true_type {}; +template +static inline constexpr bool is_simd_type_v = is_simd_type::value; -template -struct is_simd_type> : std::true_type {}; +template struct is_simd_mask_type : std::false_type {}; +template +struct is_simd_mask_type> + : std::true_type {}; +template +static inline constexpr bool is_simd_mask_type_v = is_simd_mask_type::value; +// @} -template -struct is_simd_type> : std::true_type {}; +// @{ +// Checks if given type is a view of the simd type. +template struct is_simd_view_type_impl : std::false_type {}; + +template +struct is_simd_view_type_impl> + : std::conditional_t, std::true_type, + std::false_type> {}; + +template +struct is_simd_view_type : is_simd_view_type_impl> {}; template -struct is_simd_v - : std::integral_constant>::value> {}; +static inline constexpr bool is_simd_view_type_v = is_simd_view_type::value; +// @} + +template +static inline constexpr bool is_simd_or_view_type_v = + is_simd_view_type_v || is_simd_type_v; + +// @{ +// Get the element type if it is a scalar, clang vector, simd or simd_view type. + +struct cant_deduce_element_type; -// Get the element type if it is a simd or simd_view type. 
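The is_simd_type / is_simd_mask_type / is_simd_view_type family above all follow the same shape: a primary template inheriting std::false_type, partial specializations for the class templates that should match, and _v shortcuts. A standalone sketch of that pattern with hypothetical stand-in templates (not the ESIMD ones), to show how the dispatch resolves.

#include <type_traits>

// Hypothetical stand-ins for the real ESIMD class templates.
template <typename T, int N> class vec_like {};
template <int N> class mask_like {};

// Primary template: false by default.
template <typename T> struct is_vec_like : std::false_type {};
// Partial specialization: only vec_like<T, N> matches.
template <typename T, int N>
struct is_vec_like<vec_like<T, N>> : std::true_type {};
template <typename T>
inline constexpr bool is_vec_like_v = is_vec_like<T>::value;

template <typename T> struct is_mask_like : std::false_type {};
template <int N> struct is_mask_like<mask_like<N>> : std::true_type {};
template <typename T>
inline constexpr bool is_mask_like_v = is_mask_like<T>::value;

// Combined shortcut, analogous to is_simd_or_view_type_v.
template <typename T>
inline constexpr bool is_vec_or_mask_v = is_vec_like_v<T> || is_mask_like_v<T>;

static_assert(is_vec_like_v<vec_like<float, 8>>);
static_assert(!is_vec_like_v<int>);
static_assert(is_vec_or_mask_v<mask_like<16>>);

int main() { return 0; }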
-template struct element_type { using type = remove_cvref_t; }; -template struct element_type> { - using type = Ty; +template struct element_type { + using type = cant_deduce_element_type; }; -template -struct element_type> { - using type = typename RegionTy::element_type; + +template +struct element_type>> { + using type = remove_cvref_t; +}; + +template +struct element_type>> { + using type = typename T::element_type; +}; + +template +struct element_type>> { + using type = typename is_clang_vector_type::element_type; }; -// Get the common type of a binary operator. -template ::value && is_simd_v::value>> -struct common_type { +// @} + +// @{ +// Get computation type of a binary operator given its operand types: +// - if both types are arithmetic - return CPP's "common real type" of the +// computation (matches C++) +// - if both types are simd types, they must be of the same length N, +// and the returned type is simd, where N is the "common real type" of +// the element type of the operands (diverges from clang) +// - otherwise, one type is simd and another is arithmetic - the simd type is +// returned (matches clang) + +struct invalid_computation_type; + +template struct computation_type { + using type = invalid_computation_type; +}; + +template +struct computation_type< + T1, T2, std::enable_if_t && is_vectorizable_v>> { + using type = decltype(std::declval() + std::declval()); +}; + +template +struct computation_type< + T1, T2, + std::enable_if_t && is_simd_like_type_v>> { private: using Ty1 = typename element_type::type; using Ty2 = typename element_type::type; - using EltTy = decltype(Ty1() + Ty2()); + using EltTy = typename computation_type::type; static constexpr int N1 = T1::length; static constexpr int N2 = T2::length; static_assert(N1 == N2, "size mismatch"); @@ -184,20 +411,18 @@ struct common_type { using type = simd; }; -template -using compute_type_t = - typename common_type, remove_cvref_t>::type; +template +using computation_type_t = + typename computation_type, remove_cvref_t>::type; + +// @} -template To convert(From Val) { +template +std::enable_if_t && is_clang_vector_type_v, To> +convert(From Val) { return __builtin_convertvector(Val, To); } -/// Get the computation type. -template struct computation_type { - // Currently only arithmetic operations are needed. - typedef decltype(T1() + T2()) type; -}; - /// Base case for checking if a type U is one of the types. template constexpr bool is_type() { return false; } @@ -248,18 +473,11 @@ inline std::istream &operator>>(std::istream &I, half &rhs) { rhs = ValFloat; return I; } + } // namespace detail -// TODO @rolandschulz on May 21 -// {quote} -// - The mask should also be a wrapper around the clang - vector type rather -// than the clang - vector type itself. -// - The internal storage should be implementation defined.uint16_t is a bad -// choice for some HW.Nor is it how clang - vector types works(using the same -// size int as the corresponding vector type used for comparison(e.g. long for -// double and int for float)). -template -using mask_type_t = typename detail::vector_type::type; +// Alias for backward compatibility. 
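computation_type is what decides the type a mixed binary operation computes in: the "common real type" of the scalar operands (via decltype of +), lifted back into a simd of the same length when the operands are simd-like. A standalone sketch of that deduction with a hypothetical vec stand-in, to make the two cases concrete.

#include <type_traits>
#include <utility>

// Scalar case: C++'s "common real type" of two arithmetic types.
template <typename T1, typename T2>
using common_elem_t = decltype(std::declval<T1>() + std::declval<T2>());

static_assert(std::is_same_v<common_elem_t<int, float>, float>);
static_assert(std::is_same_v<common_elem_t<char, short>, int>); // integer promotion
static_assert(std::is_same_v<common_elem_t<unsigned, int>, unsigned>);

// Vector case, with a hypothetical stand-in for simd: lengths must match,
// the element type is the common scalar type of the two element types.
template <typename T, int N> struct vec {
  using element_type = T;
  static constexpr int length = N;
};

template <typename V1, typename V2> struct vec_computation_type {
  static_assert(V1::length == V2::length, "size mismatch");
  using type = vec<common_elem_t<typename V1::element_type,
                                 typename V2::element_type>,
                   V1::length>;
};

static_assert(std::is_same_v<
    typename vec_computation_type<vec<int, 8>, vec<float, 8>>::type,
    vec<float, 8>>);

int main() { return 0; }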
+template using mask_type_t = detail::simd_mask_storage_t; } // namespace esimd } // namespace experimental diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp index c462f839e6ac5..021452042f097 100755 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp @@ -15,6 +15,10 @@ #include +#define __SEIEED sycl::ext::intel::experimental::esimd::detail +#define __SEIEE sycl::ext::intel::experimental::esimd +#define __SEIEEED sycl::ext::intel::experimental::esimd::emu::detail + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index 5f87f1b4595c7..04d93061e02b5 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -13,9 +13,11 @@ #include #include #include +#include #include #include #include +#include #include @@ -61,7 +63,7 @@ namespace detail { template ESIMD_NODEBUG ESIMD_INLINE simd __esimd_abs_common_internal(simd src0, int flag = saturation_off) { - simd Result = __esimd_abs(src0.data()); + simd Result = simd(__esimd_abs(src0.data())); if (flag != saturation_on) return Result; @@ -96,7 +98,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< typename sycl::detail::remove_const_t>::value, simd> esimd_abs(simd src0, int flag = saturation_off) { - return detail::__esimd_abs_common_internal(src0, flag); + return detail::__esimd_abs_common_internal(src0.data(), flag); } /// Get absolute value (scalar version) @@ -129,7 +131,7 @@ esimd_abs(T1 src0, int flag = saturation_off) { template ESIMD_NODEBUG ESIMD_INLINE simd esimd_abs(simd src0, int flag = saturation_off) { - return detail::__esimd_abs_common_internal(src0, flag); + return detail::__esimd_abs_common_internal(src0.data(), flag); } /// Get absolute value (scalar version). 
This is a specialization of a version @@ -165,8 +167,7 @@ ESIMD_NODEBUG ESIMD_INLINE std::is_integral::value, simd> esimd_shl(simd src0, U src1, int flag = saturation_off) { - typedef - typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; @@ -213,7 +214,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, typename sycl::detail::remove_const_t> esimd_shl(T1 src0, T2 src1, int flag = saturation_off) { - typedef typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; simd Result = esimd_shl(Src0, Src1, flag); @@ -237,8 +238,7 @@ ESIMD_NODEBUG ESIMD_INLINE std::is_integral::value, simd> esimd_shr(simd src0, U src1, int flag = saturation_off) { - typedef - typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; typename detail::simd_type::type Result = @@ -266,7 +266,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, typename sycl::detail::remove_const_t> esimd_shr(T1 src0, T2 src1, int flag = saturation_off) { - typedef typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; simd Result = esimd_shr(Src0, Src1, flag); @@ -285,7 +285,7 @@ template ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, simd> esimd_rol(simd src0, simd src1) { - return __esimd_rol(src0, src1); + return __esimd_rol(src0.data(), src1.data()); } /// Rotate left operation with a vector and a scalar inputs @@ -303,8 +303,7 @@ ESIMD_NODEBUG ESIMD_INLINE std::is_integral::value, simd> esimd_rol(simd src0, U src1) { - typedef - typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; return __esimd_rol(Src0.data(), Src1.data()); @@ -324,7 +323,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, typename sycl::detail::remove_const_t> esimd_rol(T1 src0, T2 src1) { - typedef typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; simd Result = esimd_rol(Src0, Src1); @@ -343,7 +342,7 @@ template ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, simd> esimd_ror(simd src0, simd src1) { - return __esimd_ror(src0, src1); + return __esimd_ror(src0.data(), src1.data()); } /// Rotate right operation with a vector and a scalar inputs @@ -361,8 +360,7 @@ ESIMD_NODEBUG ESIMD_INLINE std::is_integral::value, simd> esimd_ror(simd src0, U src1) { - typedef - typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; return 
__esimd_ror(Src0.data(), Src1.data()); @@ -382,7 +380,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, typename sycl::detail::remove_const_t> esimd_ror(T1 src0, T2 src1) { - typedef typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; simd Result = esimd_ror(Src0, Src1); @@ -406,7 +404,7 @@ ESIMD_NODEBUG ESIMD_INLINE std::is_integral::value, simd> esimd_lsr(simd src0, U src1, int flag = saturation_off) { - typedef typename detail::computation_type::type IntermedTy; + using IntermedTy = detail::computation_type_t; typedef typename std::make_unsigned::type ComputationTy; simd Src0 = src0; simd Result = Src0.data() >> src1.data(); @@ -434,7 +432,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, typename sycl::detail::remove_const_t> esimd_lsr(T1 src0, T2 src1, int flag = saturation_off) { - typedef typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; simd Result = esimd_lsr(Src0, Src1, flag); @@ -458,7 +456,7 @@ ESIMD_NODEBUG ESIMD_INLINE std::is_integral::value, simd> esimd_asr(simd src0, U src1, int flag = saturation_off) { - typedef typename detail::computation_type::type IntermedTy; + using IntermedTy = detail::computation_type_t; typedef typename std::make_signed::type ComputationTy; simd Src0 = src0; simd Result = Src0 >> src1; @@ -486,7 +484,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< std::is_integral::value && std::is_integral::value, typename sycl::detail::remove_const_t> esimd_asr(T1 src0, T2 src1, int flag = saturation_off) { - typedef typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; simd Result = esimd_asr(Src0, Src1, flag); @@ -503,8 +501,7 @@ ESIMD_NODEBUG ESIMD_INLINE detail::is_dword_type::value, simd> esimd_imul(simd &rmd, simd src0, U src1) { - typedef - typename detail::computation_type::type ComputationTy; + using ComputationTy = detail::computation_type_t; typename detail::simd_type::type Src0 = src0; typename detail::simd_type::type Src1 = src1; rmd = Src0 * Src1; @@ -524,8 +521,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< detail::is_dword_type::value && SZ == 1, simd> esimd_imul(simd &rmd, simd src0, U src1) { - typedef typename detail::computation_type::type - ComputationTy; + using ComputationTy = detail::computation_type_t; ComputationTy Product = convert(src0); Product *= src1; rmd = Product.bit_cast_view().select<1, 1>[0]; @@ -538,8 +534,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t< detail::is_dword_type::value && SZ != 1, simd> esimd_imul(simd &rmd, simd src0, U src1) { - typedef typename detail::computation_type::type - ComputationTy; + using ComputationTy = detail::computation_type_t; ComputationTy Product = convert(src0); Product *= src1; rmd = Product.bit_cast_view().select(0); @@ -697,15 +692,18 @@ ESIMD_NODEBUG ESIMD_INLINE simd esimd_max(simd src0, simd src1, int flag = saturation_off) { if constexpr (std::is_floating_point::value) { auto Result = __esimd_fmax(src0.data(), src1.data()); - return (flag == 
saturation_off) ? Result : __esimd_satf(Result); + Result = (flag == saturation_off) ? Result : __esimd_satf(Result); + return simd(Result); } else if constexpr (std::is_unsigned::value) { auto Result = __esimd_umax(src0.data(), src1.data()); - return (flag == saturation_off) ? Result - : __esimd_uutrunc_sat(Result); + Result = (flag == saturation_off) ? Result + : __esimd_uutrunc_sat(Result); + return simd(Result); } else { auto Result = __esimd_smax(src0.data(), src1.data()); - return (flag == saturation_off) ? Result - : __esimd_sstrunc_sat(Result); + Result = (flag == saturation_off) ? Result + : __esimd_sstrunc_sat(Result); + return simd(Result); } } @@ -781,15 +779,18 @@ ESIMD_NODEBUG ESIMD_INLINE simd esimd_min(simd src0, simd src1, int flag = saturation_off) { if constexpr (std::is_floating_point::value) { auto Result = __esimd_fmin(src0.data(), src1.data()); - return (flag == saturation_off) ? Result : __esimd_satf(Result); + Result = (flag == saturation_off) ? Result : __esimd_satf(Result); + return simd(Result); } else if constexpr (std::is_unsigned::value) { auto Result = __esimd_umin(src0.data(), src1.data()); - return (flag == saturation_off) ? Result - : __esimd_uutrunc_sat(Result); + Result = (flag == saturation_off) ? Result + : __esimd_uutrunc_sat(Result); + return simd(Result); } else { auto Result = __esimd_smin(src0.data(), src1.data()); - return (flag == saturation_off) ? Result - : __esimd_sstrunc_sat(Result); + Result = (flag == saturation_off) ? Result + : __esimd_sstrunc_sat(Result); + return simd(Result); } } @@ -1207,7 +1208,7 @@ ESIMD_NODEBUG ESIMD_INLINE template ESIMD_NODEBUG ESIMD_INLINE simd esimd_frc(simd src0) { simd Src0 = src0; - return __esimd_frc(Src0); + return __esimd_frc(Src0.data()); } /// Performs truncate-to-minus-infinity fraction operation of \p src0. 
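The esimd_max/esimd_min hunks above keep one shape: compute the raw per-lane result, then, if saturation_on was requested, run it through the matching saturating conversion before wrapping it back into a simd. As a reminder of what saturating means for the integer cases, a scalar standalone sketch; the real truncate-with-saturation intrinsics apply the same clamping lane-wise.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

// Scalar model of a saturating conversion: clamp to the destination range
// instead of wrapping.
template <typename To, typename From> To saturate_to(From v) {
  const From Lo = static_cast<From>(std::numeric_limits<To>::min());
  const From Hi = static_cast<From>(std::numeric_limits<To>::max());
  return static_cast<To>(std::clamp(v, Lo, Hi));
}

int main() {
  std::printf("%d\n", saturate_to<int8_t>(300));           // 127, not 44
  std::printf("%d\n", saturate_to<int8_t>(-300));          // -128
  std::printf("%u\n", (unsigned)saturate_to<uint8_t>(-5)); // 0, not 251
}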
@@ -1227,7 +1228,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd esimd_lzd(simd src0, int flag = saturation_off) { // Saturation parameter ignored simd Src0 = src0; - return __esimd_lzd(Src0); + return __esimd_lzd(Src0.data()); } template @@ -1251,7 +1252,8 @@ esimd_lrp(simd src0, U src1, V src2, int flag = saturation_off) { "vector size must be a multiple of 4"); simd Src1 = src1; simd Src2 = src2; - simd Result = __esimd_lrp(src0, Src1, Src2); + simd Result = + __esimd_lrp(src0.data(), Src1.data(), Src2.data()); if (flag != saturation_on) return Result; @@ -1309,7 +1311,7 @@ esimd_pln(simd src0, simd src1, simd src2, Src12.select<(SZ >> 3), 1, 8, 1>(0, 8) = src2.template bit_cast_view> 3), 8>(); - simd Result = __esimd_pln(src0, Src12.read()); + simd Result = __esimd_pln(src0.data(), Src12.read().data()); if (flag != saturation_on) return Result; @@ -1321,7 +1323,7 @@ esimd_pln(simd src0, simd src1, simd src2, template ESIMD_NODEBUG ESIMD_INLINE simd esimd_bf_reverse(simd src0) { simd Src0 = src0; - return __esimd_bfrev(Src0); + return __esimd_bfrev(Src0.data()); } template @@ -1348,7 +1350,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd Src2 = src2; simd Src3 = src3; - return __esimd_bfins(Src0, Src1, Src2, Src3); + return __esimd_bfins(Src0.data(), Src1.data(), Src2.data(), Src3.data()); } template @@ -1374,7 +1376,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd Src1 = src1; simd Src2 = src2; - return __esimd_bfext(Src0, Src1, Src2); + return __esimd_bfext(Src0.data(), Src1.data(), Src2.data()); } template @@ -1505,8 +1507,8 @@ ESIMD_NODEBUG ESIMD_INLINE esimd_atan(simd src0, int flag = saturation_off) { simd Src0 = esimd_abs(src0); - simd Neg = src0 < T(0.0); - simd Gt1 = Src0 > T(1.0); + simd_mask Neg = src0 < T(0.0); + simd_mask Gt1 = Src0 > T(1.0); Src0.merge(esimd_inv(Src0), Gt1); @@ -1548,8 +1550,8 @@ ESIMD_NODEBUG ESIMD_INLINE esimd_acos(simd src0, int flag = saturation_off) { simd Src0 = esimd_abs(src0); - simd Neg = src0 < T(0.0); - simd TooBig = Src0 >= T(0.999998); + simd_mask Neg = src0 < T(0.0); + simd_mask TooBig = Src0 >= T(0.999998); // Replace oversized values to ensure no possibility of sqrt of // a negative value later @@ -1591,7 +1593,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t::value, simd> esimd_asin(simd src0, int flag = saturation_off) { - simd Neg = src0 < T(0.0); + simd_mask Neg = src0 < T(0.0); simd Result = T(ESIMD_HDR_CONST_PI / 2.0) - esimd_acos(esimd_abs(src0)); @@ -1644,14 +1646,14 @@ ESIMD_INTRINSIC_DEF(rndz) template ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t<(N == 8 || N == 16 || N == 32), uint> - esimd_pack_mask(simd src0) { + esimd_pack_mask(simd_mask src0) { return __esimd_pack_mask(src0.data()); } template ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t<(N == 8 || N == 16 || N == 32), - simd> + simd_mask> esimd_unpack_mask(uint src0) { return __esimd_unpack_mask(src0); } @@ -1659,8 +1661,8 @@ ESIMD_NODEBUG ESIMD_INLINE template ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t<(N != 8 && N != 16 && N < 32), uint> - esimd_pack_mask(simd src0) { - simd src_0 = 0; + esimd_pack_mask(simd_mask src0) { + simd_mask<(N < 8 ? 8 : N < 16 ? 
16 : 32)> src_0 = 0; src_0.template select() = src0.template bit_cast_view(); return esimd_pack_mask(src_0); } @@ -1973,7 +1975,7 @@ ESIMD_INLINE simd esimd_atan2_fast(simd y, simd x, simd a1; simd atan2; - simd mask = (y >= 0.0f); + simd_mask mask = (y >= 0.0f); a0.merge(ESIMD_CONST_PI * 0.5f, ESIMD_CONST_PI * 1.5f, mask); a1.merge(0, ESIMD_CONST_PI * 2.0f, mask); @@ -2009,7 +2011,7 @@ ESIMD_INLINE simd esimd_atan2(simd y, simd x, simd v_distance; simd v_y0; simd atan2; - simd mask; + simd_mask mask; mask = (x < 0); v_y0.merge(ESIMD_CONST_PI, 0, mask); @@ -2027,10 +2029,10 @@ template <> ESIMD_INLINE float esimd_atan2(float y, float x, const uint flags) { float v_distance; float v_y0; simd atan2; - unsigned short mask; + simd_mask<1> mask; mask = (x < 0); - v_y0 = mask ? ESIMD_CONST_PI : 0; + v_y0 = mask[0] ? ESIMD_CONST_PI : 0; v_distance = esimd_sqrt(x * x + y * y); mask = (esimd_abs(y) < 0.000001f); atan2.merge(v_y0, (2 * esimd_atan((v_distance - x) / y)), mask); @@ -2345,11 +2347,11 @@ template struct esimd_apply_reduced_max { template simd operator()(simd v1, simd v2) { if constexpr (std::is_floating_point::value) { - return __esimd_reduced_fmax(v1, v2); + return __esimd_reduced_fmax(v1.data(), v2.data()); } else if constexpr (std::is_unsigned::value) { - return __esimd_reduced_umax(v1, v2); + return __esimd_reduced_umax(v1.data(), v2.data()); } else { - return __esimd_reduced_smax(v1, v2); + return __esimd_reduced_smax(v1.data(), v2.data()); } } }; @@ -2358,11 +2360,11 @@ template struct esimd_apply_reduced_min { template simd operator()(simd v1, simd v2) { if constexpr (std::is_floating_point::value) { - return __esimd_reduced_fmin(v1, v2); + return __esimd_reduced_fmin(v1.data(), v2.data()); } else if constexpr (std::is_unsigned::value) { - return __esimd_reduced_umin(v1, v2); + return __esimd_reduced_umin(v1.data(), v2.data()); } else { - return __esimd_reduced_smin(v1, v2); + return __esimd_reduced_smin(v1.data(), v2.data()); } } }; @@ -2426,7 +2428,7 @@ T0 esimd_reduce(simd v) { template ESIMD_INLINE ESIMD_NODEBUG T0 esimd_sum(simd v) { - using TT = compute_type_t>; + using TT = detail::computation_type_t>; using RT = typename TT::element_type; T0 retv = esimd_reduce(v); return retv; @@ -2434,7 +2436,7 @@ ESIMD_INLINE ESIMD_NODEBUG T0 esimd_sum(simd v) { template ESIMD_INLINE ESIMD_NODEBUG T0 esimd_prod(simd v) { - using TT = compute_type_t>; + using TT = detail::computation_type_t>; using RT = typename TT::element_type; T0 retv = esimd_reduce(v); return retv; @@ -2471,7 +2473,7 @@ ESIMD_INLINE ESIMD_NODEBUG T0 reduce(simd v, BinaryOperation op) { template simd esimd_dp4(simd v1, simd v2) { - auto retv = __esimd_dp4(v1, v2); + auto retv = __esimd_dp4(v1.data(), v2.data()); return retv; } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 02cfd986a0410..96b6e7bbfbcdb 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -76,7 +76,7 @@ ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t< ((n == 8 || n == 16 || n == 32) && (ElemsPerAddr == 1 || ElemsPerAddr == 2 || ElemsPerAddr == 4)), simd> -gather(T *p, simd offsets, simd pred = 1) { +gather(T *p, simd offsets, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -126,7 +126,7 @@ ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t< (ElemsPerAddr == 1 || ElemsPerAddr == 2 || ElemsPerAddr == 4)), void> 
scatter(T *p, simd vals, simd offsets, - simd pred = 1) { + simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); addrs = addrs + offsets_i; @@ -271,21 +271,21 @@ ESIMD_INLINE ESIMD_NODEBUG const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); const simd promo_vals = __esimd_surf_read(scale, surf_ind, glob_offset, offsets); + L3H>(scale, surf_ind, glob_offset, offsets.data()); #else const simd promo_vals = __esimd_surf_read( - scale, acc, glob_offset, offsets); + scale, acc, glob_offset, offsets.data()); #endif return convert(promo_vals); } else { #if defined(__SYCL_DEVICE_ONLY__) const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); return __esimd_surf_read( - scale, surf_ind, glob_offset, offsets); + scale, surf_ind, glob_offset, offsets.data()); #else return __esimd_surf_read( - scale, acc, glob_offset, offsets); + scale, acc, glob_offset, offsets.data()); #endif } } @@ -317,7 +317,7 @@ ESIMD_INLINE ESIMD_NODEBUG !std::is_pointer::value, void> scatter(AccessorTy acc, simd vals, simd offsets, - uint32_t glob_offset = 0, simd pred = 1) { + uint32_t glob_offset = 0, simd_mask pred = simd_mask(1)) { constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); // TODO (performance) use hardware-supported scale once BE supports it @@ -338,19 +338,21 @@ ESIMD_INLINE ESIMD_NODEBUG #if defined(__SYCL_DEVICE_ONLY__) const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); __esimd_surf_write( - pred, scale, surf_ind, glob_offset, offsets, promo_vals); + pred.data(), scale, surf_ind, glob_offset, offsets.data(), + promo_vals.data()); #else __esimd_surf_write( - pred, scale, acc, glob_offset, offsets, promo_vals); + pred.data(), scale, acc, glob_offset, offsets.data(), + promo_vals.data()); #endif } else { #if defined(__SYCL_DEVICE_ONLY__) const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); __esimd_surf_write( - pred, scale, surf_ind, glob_offset, offsets, vals); + pred.data(), scale, surf_ind, glob_offset, offsets.data(), vals.data()); #else __esimd_surf_write( - pred, scale, acc, glob_offset, offsets, vals); + pred.data(), scale, acc, glob_offset, offsets.data(), vals.data()); #endif } } @@ -360,7 +362,7 @@ ESIMD_INLINE ESIMD_NODEBUG template ESIMD_INLINE ESIMD_NODEBUG T scalar_load(AccessorTy acc, uint32_t offset) { - const simd Res = gather(acc, simd{offset}); + const simd Res = gather(acc, simd(offset)); return Res[0]; } @@ -370,7 +372,7 @@ template ESIMD_INLINE ESIMD_NODEBUG void scalar_store(AccessorTy acc, uint32_t offset, T val) { - scatter(acc, simd{val}, simd{offset}); + scatter(acc, simd(val), simd(offset)); } /// Gathering read for the given starting pointer \p p and \p offsets. 
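After this change the gather/scatter predicates are simd_mask values rather than raw simd<uint16_t, N> vectors, and comparisons produce exactly that type. A usage sketch under assumptions: it includes the memory.hpp/simd.hpp headers touched by this patch, is meant for ESIMD device code, and treats the offsets as byte offsets (check the header for the exact unit).

// Usage sketch only -- assumes this patch's headers and ESIMD device code.
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
#include <sycl/ext/intel/experimental/esimd/simd.hpp>
#include <cstdint>

namespace esimd = sycl::ext::intel::experimental::esimd;

// Copy 16 floats from In to Out, but only in lanes holding non-negative values.
inline void masked_copy_sketch(float *In, float *Out) {
  esimd::simd<uint32_t, 16> ByteOffsets(0, sizeof(float)); // 0, 4, 8, ...
  esimd::simd<float, 16> Vals = esimd::gather(In, ByteOffsets);
  esimd::simd_mask<16> Keep = Vals >= 0.0f;                // per-lane predicate
  esimd::scatter(Out, Vals, ByteOffsets, Keep);            // masked-off lanes untouched
}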
@@ -388,7 +390,7 @@ template > -gather_rgba(T *p, simd offsets, simd pred = 1) { +gather_rgba(T *p, simd offsets, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -407,8 +409,8 @@ ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t< simd> gather4(T *p, simd offsets, - simd - pred = 1) { + simd_mask pred = + 1) { return gather_rgba(p, offsets, pred); } @@ -429,7 +431,7 @@ ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t<(N == 16 || N == 32) && (sizeof(T) == 4), void> scatter_rgba(T *p, simd vals, - simd offsets, simd pred = 1) { + simd offsets, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); addrs = addrs + offsets_i; @@ -445,7 +447,7 @@ __SYCL_DEPRECATED("use scatter_rgba.") ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t< (n == 16 || n == 32) && (sizeof(T) == 4), void> scatter4(T *p, simd vals, - simd offsets, simd pred = 1) { + simd offsets, simd_mask pred = 1) { scatter_rgba(p, vals, offsets, pred); } @@ -555,7 +557,7 @@ template (), simd> - flat_atomic(T *p, simd offset, simd pred) { + flat_atomic(T *p, simd offset, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; @@ -570,7 +572,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t(), simd> flat_atomic(T *p, simd offset, simd src0, - simd pred) { + simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; @@ -586,7 +588,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t(), simd> flat_atomic(T *p, simd offset, simd src0, - simd src1, simd pred) { + simd src1, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; @@ -655,8 +657,8 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void slm_init(uint32_t size); template ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t<(n == 16 || n == 32), simd> - slm_load(simd offsets, simd pred = 1) { - return __esimd_slm_read(offsets.data(), pred.data()); + slm_load(simd offsets, simd_mask Pred = 1) { + return __esimd_slm_read(offsets.data(), Pred.data()); } /// SLM scatter. 
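The SLM (shared local memory) entry points get the same predicate treatment. A usage sketch under the same assumptions as the previous one (this patch's headers, ESIMD device code); slm_init sizing and byte offsets follow the surrounding declarations.

// Usage sketch only -- assumes this patch's headers and ESIMD device code.
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
#include <sycl/ext/intel/experimental/esimd/simd.hpp>
#include <cstdint>

namespace esimd = sycl::ext::intel::experimental::esimd;

inline void slm_roundtrip_sketch() {
  esimd::slm_init(16 * sizeof(int));                  // reserve SLM for 16 ints
  esimd::simd<uint32_t, 16> Offsets(0, sizeof(int));  // byte offsets 0, 4, 8, ...
  esimd::simd<int, 16> Vals(100, 1);                  // 100, 101, ...
  esimd::slm_store(Vals, Offsets);                    // predicate defaults to all-on
  esimd::simd_mask<16> Pred = 1;                      // explicit all-enabled mask
  esimd::simd<int, 16> Back = esimd::slm_load<int, 16>(Offsets, Pred);
  (void)Back;
}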
@@ -664,7 +666,7 @@ template ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t<(n == 16 || n == 32), void> slm_store(simd vals, simd offsets, - simd pred = 1) { + simd_mask pred = 1) { __esimd_slm_write(offsets.data(), vals.data(), pred.data()); } @@ -675,7 +677,7 @@ template ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t< (n == 8 || n == 16 || n == 32) && (sizeof(T) == 4), simd> -slm_load4(simd offsets, simd pred = 1) { +slm_load4(simd offsets, simd_mask pred = 1) { return __esimd_slm_read4(offsets.data(), pred.data()); } @@ -684,7 +686,7 @@ template ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t< (n == 8 || n == 16 || n == 32) && (sizeof(T) == 4), void> slm_store4(simd vals, - simd offsets, simd pred = 1) { + simd offsets, simd_mask pred = 1) { __esimd_slm_write4(offsets.data(), vals.data(), pred.data()); } @@ -727,7 +729,7 @@ template ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t(), simd> - slm_atomic(simd offsets, simd pred) { + slm_atomic(simd offsets, simd_mask pred) { return __esimd_slm_atomic0(offsets.data(), pred.data()); } @@ -736,8 +738,7 @@ template ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t(), simd> - slm_atomic(simd offsets, simd src0, - simd pred) { + slm_atomic(simd offsets, simd src0, simd_mask pred) { return __esimd_slm_atomic1(offsets.data(), src0.data(), pred.data()); } @@ -748,7 +749,7 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t(), simd> slm_atomic(simd offsets, simd src0, simd src1, - simd pred) { + simd_mask pred) { return __esimd_slm_atomic2(offsets.data(), src0.data(), src1.data(), pred.data()); } @@ -830,14 +831,15 @@ media_block_store(AccessorTy acc, unsigned x, unsigned y, simd vals) { temp_ref.template select() = vals_ref; __esimd_media_block_store( 0, detail::AccessorPrivateProxy::getNativeImageObj(acc), plane, - sizeof(T) * n, x, y, temp); + sizeof(T) * n, x, y, temp.data()); } else { __esimd_media_block_store( 0, detail::AccessorPrivateProxy::getNativeImageObj(acc), plane, - sizeof(T) * n, x, y, vals); + sizeof(T) * n, x, y, vals.data()); } #else - __esimd_media_block_store(0, acc, plane, sizeof(T) * n, x, y, vals); + __esimd_media_block_store(0, acc, plane, sizeof(T) * n, x, y, + vals.data()); #endif // __SYCL_DEVICE_ONLY__ } @@ -902,7 +904,7 @@ esimd_raw_sends_load(simd msgDst, simd msgSrc0, simd msgSrc1, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, uint8_t isEOT = 0, - uint8_t isSendc = 0, simd mask = 1) { + uint8_t isSendc = 0, simd_mask mask = 1) { constexpr unsigned _Width1 = n1 * sizeof(T1); static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar"); constexpr unsigned _Width2 = n2 * sizeof(T2); @@ -942,7 +944,7 @@ ESIMD_INLINE ESIMD_NODEBUG simd esimd_raw_send_load(simd msgDst, simd msgSrc0, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0, - uint8_t isSendc = 0, simd mask = 1) { + uint8_t isSendc = 0, simd_mask mask = 1) { constexpr unsigned _Width1 = n1 * sizeof(T1); static_assert(_Width1 % 32 == 0, "Invalid size for raw send rspVar"); constexpr unsigned _Width2 = n2 * sizeof(T2); @@ -980,7 +982,7 @@ esimd_raw_sends_store(simd msgSrc0, simd msgSrc1, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0, uint8_t isSendc = 0, - simd mask = 1) { + simd_mask mask = 1) { constexpr unsigned _Width1 = n1 * sizeof(T1); static_assert(_Width1 % 32 == 
0, "Invalid size for raw send msgSrc0"); constexpr unsigned _Width2 = n2 * sizeof(T2); @@ -1014,7 +1016,7 @@ ESIMD_INLINE ESIMD_NODEBUG void esimd_raw_send_store(simd msgSrc0, uint32_t exDesc, uint32_t msgDesc, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT = 0, uint8_t isSendc = 0, - simd mask = 1) { + simd_mask mask = 1) { constexpr unsigned _Width1 = n1 * sizeof(T1); static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0"); diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp index 26abc25916ae0..bbab78bdd103f 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp @@ -10,12 +10,18 @@ #pragma once -#include +#include +#include #include #include #include #include +#include + +#ifndef __SYCL_DEVICE_ONLY__ +#include +#endif // __SYCL_DEVICE_ONLY__ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -32,654 +38,96 @@ namespace esimd { /// read-update-write semantics. /// /// \ingroup sycl_esimd -template class simd { - template friend class simd_view; +template +class simd + : public detail::simd_obj_impl< + Ty, N, simd, std::enable_if_t>> { + using base_type = detail::simd_obj_impl>; public: - /// The underlying builtin data type. - using vector_type = detail::vector_type_t; - - /// The element type of this simd object. - using element_type = Ty; - - /// The number of elements in this simd object. + using base_type::base_type; + using element_type = typename base_type::element_type; + using vector_type = typename base_type::vector_type; static constexpr int length = N; - /// @{ - /// Constructors. - simd() = default; - simd(const simd &other) { set(other.data()); } - template simd(const simd &other) { - set(__builtin_convertvector(other.data(), detail::vector_type_t)); - } - template simd(simd &&other) { - if constexpr (std::is_same::value) - set(other.data()); - else - set(__builtin_convertvector(other.data(), detail::vector_type_t)); - } - simd(const vector_type &Val) { set(Val); } - - // TODO @rolandschulz - // {quote} - // Providing both an overload of initializer-list and the same type itself - // causes really weird behavior. E.g. - // simd s1(1,2); //calls next constructor - // simd s2{1,2}; //calls this constructor - // This might not be confusing for all users but to everyone using - // uniform-initialization syntax. Therefore if you want to use this - // constructor the other one should have a special type (see - // https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#es64-use-the-tenotation-for-construction) - // to avoid this issue. Also this seems like one of those areas where this - // simd-type needless differs from std::simd. Why should these constructors be - // different? Why reinvent the wheel and have all the work of fixing these - // problems if we could just use the existing solution. Especially if that is - // anyhow the long-term goal. Adding extra stuff like the select is totally - // fine. But differ on things which have no apparent advantage and aren't as - // thought through seems to have only downsides. - // {/quote} - - simd(std::initializer_list Ilist) noexcept { - int i = 0; - for (auto It = Ilist.begin(); It != Ilist.end() && i < N; ++It) { - M_data[i++] = *It; - } - } - - /// Initialize a simd with an initial value and step. 
- simd(Ty Val, Ty Step = Ty()) noexcept { - if (Step == Ty()) - M_data = Val; - else { -#pragma unroll - for (int i = 0; i < N; ++i) { - M_data[i] = Val; - Val += Step; - } - } - } - /// @} - - /// conversion operator - operator const vector_type &() const & { return M_data; } - operator vector_type &() & { return M_data; } - - /// Implicit conversion for simd into T. - template > - operator element_type() const { - return data()[0]; - } - - vector_type data() const { -#ifndef __SYCL_DEVICE_ONLY__ - return M_data; -#else - return __esimd_vload(&M_data); -#endif - } - - /// Whole region read. - simd read() const { return data(); } - - /// Whole region write. - simd &write(const simd &Val) { - set(Val.data()); - return *this; - } - - /// Whole region update with predicates. - void merge(const simd &Val, const mask_type_t &Mask) { - set(__esimd_wrregion( - data(), Val.data(), 0, Mask)); - } - void merge(const simd &Val1, simd Val2, const mask_type_t &Mask) { - Val2.merge(Val1, Mask); - set(Val2.data()); - } - - /// View this simd object in a different element type. - template auto bit_cast_view() &[[clang::lifetimebound]] { - using TopRegionTy = detail::compute_format_type_t; - using RetTy = simd_view; - TopRegionTy R(0); - return RetTy{*this, R}; + // Implicit conversion constructor from another simd object of the same + // length. + template && + (length == SimdT::length)>> + simd(const SimdT &RHS) + : base_type(__builtin_convertvector(RHS.data(), vector_type)) { + __esimd_dbg_print(simd(const SimdT &RHS)); } - template - __SYCL_DEPRECATED("use simd::bit_cast_view.") - auto format() & { - return bit_cast_view(); + // Broadcast constructor with conversion. + template >> + simd(T1 Val) : base_type((Ty)Val) { + __esimd_dbg_print(simd(T1 Val)); } - /// View as a 2-dimensional simd_view. - template - auto bit_cast_view() &[[clang::lifetimebound]] { - using TopRegionTy = - detail::compute_format_type_2d_t; - using RetTy = simd_view; - TopRegionTy R(0, 0); - return RetTy{*this, R}; + /// Explicit conversion for simd_obj_impl into T. + template >> + operator To() const { + __esimd_dbg_print(explicit operator To()); + return (To)base_type::data()[0]; } - template - __SYCL_DEPRECATED("use simd::bit_cast_view.") - auto format() & { - return bit_cast_view(); - } - - /// 1D region select, apply a region on top of this LValue object. - /// - /// \tparam Size is the number of elements to be selected. - /// \tparam Stride is the element distance between two consecutive elements. - /// \param Offset is the starting element offset. - /// \return the representing region object. - template - simd_view> select(uint16_t Offset = 0) &[ - [clang::lifetimebound]] { - region1d_t Reg(Offset); - return {*this, Reg}; - } - - /// 1D region select, apply a region on top of this RValue object. - /// - /// \tparam Size is the number of elements to be selected. - /// \tparam Stride is the element distance between two consecutive elements. - /// \param Offset is the starting element offset. - /// \return the value this region object refers to. - template - simd select(uint16_t Offset = 0) && { - simd &&Val = *this; - return __esimd_rdregion(Val.data(), - Offset); - } - - /// Read single element, return value only (not reference). - Ty operator[](int i) const { return data()[i]; } - - /// Read single element, return value only (not reference). - __SYCL_DEPRECATED("use operator[] form.") - Ty operator()(int i) const { return data()[i]; } - - /// Return writable view of a single element. 
- simd_view> operator[](int i) - [[clang::lifetimebound]] { - return select<1, 0>(i); - } - - /// Return writable view of a single element. - __SYCL_DEPRECATED("use operator[] form.") - simd_view> operator()(int i) { - return select<1, 0>(i); - } - - // TODO ESIMD_EXPERIMENTAL - /// Read multiple elements by their indices in vector - template - simd iselect(const simd &Indices) { - detail::vector_type_t Offsets = Indices.data() * sizeof(Ty); - return __esimd_rdindirect(data(), Offsets); - } - // TODO ESIMD_EXPERIMENTAL - /// update single element - void iupdate(ushort Index, Ty V) { - auto Val = data(); - Val[Index] = V; - set(Val); - } - // TODO ESIMD_EXPERIMENTAL - /// update multiple elements by their indices in vector - template - void iupdate(const simd &Indices, const simd &Val, - mask_type_t Mask) { - detail::vector_type_t Offsets = Indices.data() * sizeof(Ty); - set(__esimd_wrindirect(data(), Val.data(), Offsets, Mask)); - } - - // Use auto as a return type to honor C++ integer promotion rules, - // e.g. simd + simd -> simd -#define DEF_BINOP(BINOP, OPASSIGN) \ - ESIMD_INLINE friend auto operator BINOP(const simd &X, const simd &Y) { \ - using ComputeTy = detail::compute_type_t; \ - auto V0 = detail::convert(X.data()); \ - auto V1 = detail::convert(Y.data()); \ - auto V2 = V0 BINOP V1; \ - return ComputeTy(V2); \ - } \ - template >> \ - ESIMD_INLINE friend auto operator BINOP(const simd &X, T1 Y) { \ - return X BINOP simd((Ty)Y); \ - } \ - ESIMD_INLINE friend simd &operator OPASSIGN(simd &LHS, const simd &RHS) { \ - using ComputeTy = detail::compute_type_t; \ - auto V0 = detail::convert(LHS.data()); \ - auto V1 = detail::convert(RHS.data()); \ - auto V2 = V0 BINOP V1; \ - LHS.write(detail::convert(V2)); \ - return LHS; \ - } \ - ESIMD_INLINE friend simd &operator OPASSIGN(simd &LHS, const Ty &RHS) { \ - LHS OPASSIGN simd(RHS); \ - return LHS; \ - } - - DEF_BINOP(+, +=) - DEF_BINOP(-, -=) - DEF_BINOP(*, *=) - DEF_BINOP(/, /=) - DEF_BINOP(%, %=) - -#undef DEF_BINOP - - // TODO @rolandschulz, @mattkretz - // Introduce simd_mask type and let user use this type instead of specific - // type representation (simd) to make it more portable - // TODO @iburyl should be mask_type_t, which might become more abstracted in - // the future revisions. 
- // -#define DEF_RELOP(RELOP) \ - ESIMD_INLINE friend simd operator RELOP(const simd &X, \ - const simd &Y) { \ - auto R = X.data() RELOP Y.data(); \ - mask_type_t M(1); \ - return M & detail::convert>(R); \ - } \ - template >> \ - ESIMD_INLINE friend bool operator RELOP(const simd &X, T1 Y) { \ - return (Ty)X RELOP(Ty) Y; \ - } - - DEF_RELOP(>) - DEF_RELOP(>=) - DEF_RELOP(<) - DEF_RELOP(<=) - DEF_RELOP(==) - DEF_RELOP(!=) - -#undef DEF_RELOP - -#define DEF_BITWISE_OP(BITWISE_OP, OPASSIGN) \ - ESIMD_INLINE friend simd operator BITWISE_OP(const simd &X, const simd &Y) { \ - static_assert(std::is_integral(), "not integeral type"); \ - auto V2 = X.data() BITWISE_OP Y.data(); \ - return simd(V2); \ - } \ - ESIMD_INLINE friend simd &operator OPASSIGN(simd &LHS, const simd &RHS) { \ - static_assert(std::is_integral(), "not integeral type"); \ - auto V2 = LHS.data() BITWISE_OP RHS.data(); \ - LHS.write(detail::convert(V2)); \ - return LHS; \ - } \ - ESIMD_INLINE friend simd &operator OPASSIGN(simd &LHS, const Ty &RHS) { \ - LHS OPASSIGN simd(RHS); \ - return LHS; \ - } - - DEF_BITWISE_OP(&, &=) - DEF_BITWISE_OP(|, |=) - DEF_BITWISE_OP(^, ^=) - DEF_BITWISE_OP(<<, <<=) - DEF_BITWISE_OP(>>, >>=) - -#undef DEF_BITWISE_OP - - // Operator ++, -- + /// @{ + /// Infix and postfix operators ++, -- simd &operator++() { *this += 1; return *this; } + simd operator++(int) { - simd Ret(*this); + simd Ret(base_type::data()); operator++(); return Ret; } + simd &operator--() { *this -= 1; return *this; } + simd operator--(int) { - simd Ret(*this); + simd Ret(base_type::data()); operator--(); return Ret; } + /// @} -#define DEF_UNARY_OP(UNARY_OP) \ - simd operator UNARY_OP() { \ - auto V = UNARY_OP(data()); \ - return simd(V); \ - } - DEF_UNARY_OP(~) - DEF_UNARY_OP(+) - DEF_UNARY_OP(-) - -#undef DEF_UNARY_OP - - // negation operator - auto operator!() { return *this == 0; } - - /// \name Replicate - /// Replicate simd instance given a region. - /// @{ - /// - - /// \tparam Rep is number of times region has to be replicated. - /// \return replicated simd instance. - template simd replicate() { - return replicate(0); - } - - /// \tparam Rep is number of times region has to be replicated. - /// \tparam W is width of src region to replicate. - /// \param Offset is offset in number of elements in src region. - /// \return replicated simd instance. - template - __SYCL_DEPRECATED("use simd::replicate_w") - simd replicate(uint16_t Offset) { - return replicate_w(Offset); - } - - /// \tparam Rep is number of times region has to be replicated. - /// \tparam W is width of src region to replicate. - /// \param Offset is offset in number of elements in src region. - /// \return replicated simd instance. - template simd replicate_w(uint16_t Offset) { - return replicate_vs_w_hs(Offset); - } - - /// \tparam Rep is number of times region has to be replicated. - /// \tparam VS vertical stride of src region to replicate. - /// \tparam W is width of src region to replicate. - /// \param Offset is offset in number of elements in src region. - /// \return replicated simd instance. - template - __SYCL_DEPRECATED("use simd::replicate_vs_w") - simd replicate(uint16_t Offset) { - return replicate_vs_w(Offset); - } - - /// \tparam Rep is number of times region has to be replicated. - /// \tparam VS vertical stride of src region to replicate. - /// \tparam W width of src region to replicate. - /// \param Offset offset in number of elements in src region. - /// \return replicated simd instance. 
- template - simd replicate_vs_w(uint16_t Offset) { - return replicate_vs_w_hs(Offset); - } - - /// \tparam Rep is number of times region has to be replicated. - /// \tparam VS vertical stride of src region to replicate. - /// \tparam W is width of src region to replicate. - /// \tparam HS horizontal stride of src region to replicate. - /// \param Offset is offset in number of elements in src region. - /// \return replicated simd instance. - template - __SYCL_DEPRECATED("use simd::replicate_vs_w_hs") - simd replicate(uint16_t Offset) { - return replicate_vs_w_hs(Offset); - } - - /// \tparam Rep is number of times region has to be replicated. - /// \tparam VS vertical stride of src region to replicate. - /// \tparam W is width of src region to replicate. - /// \tparam HS horizontal stride of src region to replicate. - /// \param Offset is offset in number of elements in src region. - /// \return replicated simd instance. - template - simd replicate_vs_w_hs(uint16_t Offset) { - return __esimd_rdregion( - data(), Offset * sizeof(Ty)); - } - ///@} - - /// Any operation. - /// - /// \return 1 if any element is set, 0 otherwise. - template < - typename T1 = element_type, typename T2 = Ty, - typename = sycl::detail::enable_if_t::value, T2>> - uint16_t any() { - return __esimd_any(data()); - } - - /// All operation. - /// - /// \return 1 if all elements are set, 0 otherwise. - template < - typename T1 = element_type, typename T2 = Ty, - typename = sycl::detail::enable_if_t::value, T2>> - uint16_t all() { - return __esimd_all(data()); - } - - /// Write a simd-vector into a basic region of a simd object. - template - ESIMD_INLINE void - writeRegion(RTy Region, - const detail::vector_type_t &Val) { - using ElemTy = typename RTy::element_type; - if constexpr (N * sizeof(Ty) == RTy::length * sizeof(ElemTy)) - // update the entire vector - set(detail::bitcast(Val)); - else { - static_assert(!RTy::Is_2D); - // If element type differs, do bitcast conversion first. - auto Base = detail::bitcast(data()); - constexpr int BN = (N * sizeof(Ty)) / sizeof(ElemTy); - // Access the region information. - constexpr int M = RTy::Size_x; - constexpr int Stride = RTy::Stride_x; - uint16_t Offset = Region.M_offset_x * sizeof(ElemTy); - - // Merge and update. - auto Merged = __esimd_wrregion(Base, Val, Offset); - // Convert back to the original element type, if needed. - set(detail::bitcast(Merged)); - } - } - - /// Write a simd-vector into a nested region of a simd object. - template - ESIMD_INLINE void writeRegion( - std::pair Region, - const detail::vector_type_t &Val) { - // parent-region type - using PaTy = typename shape_type::type; - using ElemTy = typename TR::element_type; - using BT = typename PaTy::element_type; - constexpr int BN = PaTy::length; - - if constexpr (PaTy::Size_in_bytes == TR::Size_in_bytes) { - writeRegion(Region.second, detail::bitcast(Val)); - } else { - // Recursively read the base - auto Base = detail::readRegion(data(), Region.second); - // If element type differs, do bitcast conversion first. - auto Base1 = detail::bitcast(Base); - constexpr int BN1 = PaTy::Size_in_bytes / sizeof(ElemTy); - - if constexpr (!TR::Is_2D) { - // Access the region information. - constexpr int M = TR::Size_x; - constexpr int Stride = TR::Stride_x; - uint16_t Offset = Region.first.M_offset_x * sizeof(ElemTy); - - // Merge and update. - Base1 = __esimd_wrregion(Base1, Val, Offset); - } else { - static_assert(std::is_same::value); - // Read columns with non-trivial horizontal stride. 
- constexpr int M = TR::length; - constexpr int VS = PaTy::Size_x * TR::Stride_y; - constexpr int W = TR::Size_x; - constexpr int HS = TR::Stride_x; - constexpr int ParentWidth = PaTy::Size_x; - - // Compute the byte offset for the starting element. - uint16_t Offset = static_cast( - (Region.first.M_offset_y * PaTy::Size_x + Region.first.M_offset_x) * - sizeof(ElemTy)); - - // Merge and update. - Base1 = __esimd_wrregion( - Base1, Val, Offset); - } - // Convert back to the original element type, if needed. - auto Merged1 = detail::bitcast(Base1); - // recursively write it back to the base - writeRegion(Region.second, Merged1); - } +#define __ESIMD_DEF_SIMD_ARITH_UNARY_OP(ARITH_UNARY_OP) \ + template simd operator ARITH_UNARY_OP() { \ + static_assert(!std::is_unsigned_v, \ + #ARITH_UNARY_OP "doesn't apply to unsigned types"); \ + return simd(ARITH_UNARY_OP(base_type::data())); \ } - /// @name Memory operations - /// TODO NOTE: These APIs do not support cache hint specification yet, as this - /// is WIP. Later addition of hints is not expected to break code using these - /// APIs. - /// - /// @{ - - /// Copy a contiguous block of data from memory into this simd object. - /// The amount of memory copied equals the total size of vector elements in - /// this object. - /// @param addr the memory address to copy from. Must be a pointer to the - /// global address space, otherwise behavior is undefined. - ESIMD_INLINE void copy_from(const Ty *const addr) SYCL_ESIMD_FUNCTION; - - /// Copy a contiguous block of data from memory into this simd object. - /// The amount of memory copied equals the total size of vector elements in - /// this object. - /// Source memory location is represented via a global accessor and offset. - /// @param acc accessor to copy from. - /// @param offset offset to copy from. - template - ESIMD_INLINE - detail::EnableIfAccessor - copy_from(AccessorT acc, uint32_t offset) SYCL_ESIMD_FUNCTION; - - /// Copy all vector elements of this object into a contiguous block in memory. - /// @param addr the memory address to copy to. Must be a pointer to the - /// global address space, otherwise behavior is undefined. - ESIMD_INLINE void copy_to(Ty *addr) SYCL_ESIMD_FUNCTION; - - /// Copy all vector elements of this object into a contiguous block in memory. - /// Destination memory location is represented via a global accessor and - /// offset. - /// @param acc accessor to copy from. - /// @param offset offset to copy from. - template - ESIMD_INLINE - detail::EnableIfAccessor - copy_to(AccessorT acc, uint32_t offset) SYCL_ESIMD_FUNCTION; - - /// @} // Memory operations -private: - // The underlying data for this vector. - vector_type M_data; - - void set(const vector_type &Val) { -#ifndef __SYCL_DEVICE_ONLY__ - M_data = Val; -#else - __esimd_vstore(&M_data, Val); -#endif - } + __ESIMD_DEF_SIMD_ARITH_UNARY_OP(-) + __ESIMD_DEF_SIMD_ARITH_UNARY_OP(+) +#undef __ESIMD_DEF_SIMD_ARITH_UNARY_OP }; -template -ESIMD_INLINE simd convert(simd val) { - return __builtin_convertvector(val.data(), detail::vector_type_t); +/// Covert from a simd object with element type \c From to a simd object with +/// element type \c To. +template +ESIMD_INLINE simd convert(const simd &val) { + if constexpr (std::is_same_v) + return val; + else + return __builtin_convertvector(val.data(), detail::vector_type_t); } -// ----------- Outlined implementations of esimd class APIs. 
- -template void simd::copy_from(const T *const Addr) { - constexpr unsigned Sz = sizeof(T) * N; - static_assert(Sz >= detail::OperandSize::OWORD, - "block size must be at least 1 oword"); - static_assert(Sz % detail::OperandSize::OWORD == 0, - "block size must be whole number of owords"); - static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD), - "block must be 1, 2, 4 or 8 owords long"); - static_assert(Sz <= 8 * detail::OperandSize::OWORD, - "block size must be at most 8 owords"); +#undef __ESIMD_DEF_RELOP +#undef __ESIMD_DEF_BITWISE_OP - uintptr_t AddrVal = reinterpret_cast(Addr); - *this = - __esimd_flat_block_read_unaligned( - AddrVal); -} - -template -template -ESIMD_INLINE - detail::EnableIfAccessor - simd::copy_from(AccessorT acc, uint32_t offset) { - constexpr unsigned Sz = sizeof(T) * N; - static_assert(Sz >= detail::OperandSize::OWORD, - "block size must be at least 1 oword"); - static_assert(Sz % detail::OperandSize::OWORD == 0, - "block size must be whole number of owords"); - static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD), - "block must be 1, 2, 4 or 8 owords long"); - static_assert(Sz <= 8 * detail::OperandSize::OWORD, - "block size must be at most 8 owords"); -#if defined(__SYCL_DEVICE_ONLY__) - auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); - *this = __esimd_block_read(surf_ind, offset); -#else - *this = __esimd_block_read(acc, offset); -#endif // __SYCL_DEVICE_ONLY__ -} - -template void simd::copy_to(T *addr) { - constexpr unsigned Sz = sizeof(T) * N; - static_assert(Sz >= detail::OperandSize::OWORD, - "block size must be at least 1 oword"); - static_assert(Sz % detail::OperandSize::OWORD == 0, - "block size must be whole number of owords"); - static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD), - "block must be 1, 2, 4 or 8 owords long"); - static_assert(Sz <= 8 * detail::OperandSize::OWORD, - "block size must be at most 8 owords"); - - uintptr_t AddrVal = reinterpret_cast(addr); - __esimd_flat_block_write(AddrVal, - data()); -} - -template -template -ESIMD_INLINE - detail::EnableIfAccessor - simd::copy_to(AccessorT acc, uint32_t offset) { - constexpr unsigned Sz = sizeof(T) * N; - static_assert(Sz >= detail::OperandSize::OWORD, - "block size must be at least 1 oword"); - static_assert(Sz % detail::OperandSize::OWORD == 0, - "block size must be whole number of owords"); - static_assert(detail::isPowerOf2(Sz / detail::OperandSize::OWORD), - "block must be 1, 2, 4 or 8 owords long"); - static_assert(Sz <= 8 * detail::OperandSize::OWORD, - "block size must be at most 8 owords"); - -#if defined(__SYCL_DEVICE_ONLY__) - auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); - __esimd_block_write(surf_ind, offset >> 4, data()); -#else - __esimd_block_write(acc, offset >> 4, data()); -#endif // __SYCL_DEVICE_ONLY__ -} +/// Represents a simd mask. 
+template using simd_mask = detail::simd_mask_type; } // namespace esimd } // namespace experimental @@ -688,11 +136,12 @@ ESIMD_INLINE } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) -#ifndef __SYCL_DEVICE_ONLY__ template -std::ostream & -operator<<(std::ostream &OS, - const sycl::ext::intel::experimental::esimd::simd &V) { +std::ostream &operator<<(std::ostream &OS, const __SEIEE::simd &V) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ OS << "{"; for (int I = 0; I < N; I++) { OS << V[I]; @@ -702,5 +151,4 @@ operator<<(std::ostream &OS, OS << "}"; return OS; } - -#endif +#endif // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp index 51751e3cca65f..a6d18fcde31a4 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp @@ -11,6 +11,7 @@ #pragma once #include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -25,143 +26,79 @@ namespace esimd { /// /// \ingroup sycl_esimd template -class simd_view : public detail::simd_view_impl> { +class simd_view : public detail::simd_view_impl { + template friend class detail::simd_obj_impl; + template friend class detail::simd_mask_impl; + template friend class simd_view; template friend class simd; - template friend class detail::simd_view_impl; + template friend class detail::simd_view_impl; public: - using BaseClass = - detail::simd_view_impl>; + static_assert(detail::is_simd_obj_impl_derivative_v); + using BaseClass = detail::simd_view_impl; + + // Deduce the corresponding value type from its region type. using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; + using base_type = BaseTy; + template + using get_simd_t = typename BaseClass::template get_simd_t; + + /// The region type of this class. + using region_type = RegionTy; + + /// The element type of this class, which could be different from the element + /// type of the base object type. using element_type = typename ShapeTy::element_type; - /// The simd type if reading this simd_view object. - using value_type = simd; + /// The simd type if reading the object. + using value_type = get_simd_t; -private: + /// The underlying builtin value type + using vector_type = detail::vector_type_t; + +protected: + /// @{ + /// Constructors. simd_view(BaseTy &Base, RegionTy Region) : BaseClass(Base, Region) {} simd_view(BaseTy &&Base, RegionTy Region) : BaseClass(Base, Region) {} + /// @} public: // Default copy and move constructors for simd_view. simd_view(const simd_view &Other) = default; simd_view(simd_view &&Other) = default; - /// @{ - /// Assignment operators. simd_view &operator=(const simd_view &Other) { - *this = Other.read(); - return *this; - } - simd_view &operator=(const value_type &Val) { - this->M_base.writeRegion(this->M_region, Val.data()); + BaseClass::operator=(Other); return *this; } - /// @} - /// Move assignment operator. 
- simd_view &operator=(simd_view &&Other) { - *this = Other.read(); - return *this; - } + using BaseClass::operator--; + using BaseClass::operator++; + using BaseClass::operator=; +}; -#define DEF_BINOP(BINOP, OPASSIGN) \ - ESIMD_INLINE friend auto operator BINOP(const simd_view &X, \ - const value_type &Y) { \ - using ComputeTy = detail::compute_type_t; \ - auto V0 = \ - detail::convert(X.read().data()); \ - auto V1 = detail::convert(Y.data()); \ - auto V2 = V0 BINOP V1; \ - return ComputeTy(V2); \ - } \ - ESIMD_INLINE friend auto operator BINOP(const simd_view &X, \ - const element_type &Y) { \ - return X BINOP(value_type) Y; \ - } \ - ESIMD_INLINE friend auto operator BINOP(const element_type &X, \ - const simd_view &Y) { \ - return (value_type)X BINOP Y; \ - } \ - ESIMD_INLINE friend auto operator BINOP(const value_type &X, \ +#define __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(RELOP) \ + /* simd_view RELOP simd_view */ \ + ESIMD_INLINE friend bool operator RELOP(const simd_view &X, \ const simd_view &Y) { \ - using ComputeTy = detail::compute_type_t; \ - auto V0 = detail::convert(X.data()); \ - auto V1 = \ - detail::convert(Y.read().data()); \ - auto V2 = V0 BINOP V1; \ - return ComputeTy(V2); \ - } - - DEF_BINOP(+, +=) - DEF_BINOP(-, -=) - DEF_BINOP(*, *=) - DEF_BINOP(/, /=) - DEF_BINOP(%, %=) - -#undef DEF_BINOP - -#define DEF_BITWISE_OP(BITWISE_OP, OPASSIGN) \ - ESIMD_INLINE friend auto operator BITWISE_OP(const simd_view &X, \ - const value_type &Y) { \ - static_assert(std::is_integral(), "not integral type"); \ - auto V2 = X.read().data() BITWISE_OP Y.data(); \ - return simd(V2); \ - } \ - ESIMD_INLINE friend auto operator BITWISE_OP(const simd_view &X, \ - const element_type &Y) { \ - return X BITWISE_OP(value_type) Y; \ - } \ - ESIMD_INLINE friend auto operator BITWISE_OP(const value_type &X, \ - const simd_view &Y) { \ - static_assert(std::is_integral(), "not integral type"); \ - auto V2 = X.data() BITWISE_OP Y.read().data(); \ - return simd(V2); \ - } - - DEF_BITWISE_OP(&, &=) - DEF_BITWISE_OP(|, |=) - DEF_BITWISE_OP(^, ^=) - DEF_BITWISE_OP(>>, >>=) - DEF_BITWISE_OP(<<, <<=) - -#undef DEF_BITWISE_OP - -#define DEF_RELOP(RELOP) \ - ESIMD_INLINE friend simd operator RELOP( \ - const simd_view &X, const value_type &Y) { \ - auto R = X.read().data() RELOP Y.data(); \ - mask_type_t M(1); \ - return M & detail::convert>(R); \ - } \ - ESIMD_INLINE friend simd operator RELOP( \ - const value_type &X, const simd_view &Y) { \ - auto R = X.data() RELOP Y.read().data(); \ - mask_type_t M(1); \ - return M & detail::convert>(R); \ + return (element_type)X RELOP(element_type) Y; \ } \ - ESIMD_INLINE friend simd operator RELOP( \ - const simd_view &X, const element_type &Y) { \ - return X RELOP(value_type) Y; \ + \ + /* simd_view RELOP SCALAR */ \ + template >> \ + ESIMD_INLINE friend bool operator RELOP(const simd_view &X, T1 Y) { \ + return (element_type)X RELOP Y; \ } \ - ESIMD_INLINE friend simd operator RELOP( \ - const simd_view &X, const simd_view &Y) { \ - return (X RELOP Y.read()); \ + \ + /* SCALAR RELOP simd_view */ \ + template >> \ + ESIMD_INLINE friend bool operator RELOP(T1 X, const simd_view &Y) { \ + return X RELOP(element_type) Y; \ } - DEF_RELOP(>) - DEF_RELOP(>=) - DEF_RELOP(<) - DEF_RELOP(<=) - DEF_RELOP(==) - DEF_RELOP(!=) - -#undef DEF_RELOP -}; - /// This is a specialization of simd_view class with a single element. 
/// Objects of such a class are created in the following situation: /// simd v = 1; @@ -174,24 +111,26 @@ class simd_view : public detail::simd_view_impl v[1] && v[2] < 42; /// /// \ingroup sycl_esimd -template -class simd_view> - : public detail::simd_view_impl< - BaseTy, region1d_scalar_t, - simd_view>> { - template friend class simd; - template friend class detail::simd_view_impl; +template +class simd_view> + : public detail::simd_view_impl> { + template friend class detail::simd_obj_impl; + template friend class detail::simd_view_impl; public: - using RegionTy = region1d_scalar_t; - using BaseClass = - detail::simd_view_impl>; + using RegionTy = region1d_scalar_t; + using BaseClass = detail::simd_view_impl; using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; static_assert(1 == length, "length of this view is not equal to 1"); /// The element type of this class, which could be different from the element /// type of the base object type. - using element_type = T; + using element_type = typename ShapeTy::element_type; + using base_type = BaseTy; + template + using get_simd_t = typename BaseClass::template get_simd_t; + /// The simd type if reading the object. + using value_type = get_simd_t; private: simd_view(BaseTy &Base, RegionTy Region) : BaseClass(Base, Region) {} @@ -203,28 +142,16 @@ class simd_view> return v[0]; } + using BaseClass::operator--; + using BaseClass::operator++; using BaseClass::operator=; -#define DEF_RELOP(RELOP) \ - ESIMD_INLINE friend bool operator RELOP(const simd_view &X, \ - const simd_view &Y) { \ - return (element_type)X RELOP(element_type) Y; \ - } \ - template ::value && \ - detail::is_vectorizable_v::value>> \ - ESIMD_INLINE friend bool operator RELOP(const simd_view &X, T1 Y) { \ - return (element_type)X RELOP Y; \ - } - - DEF_RELOP(>) - DEF_RELOP(>=) - DEF_RELOP(<) - DEF_RELOP(<=) - DEF_RELOP(==) - DEF_RELOP(!=) - -#undef DEF_RELOP + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(>) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(>=) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(<) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(<=) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(==) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(!=) }; // TODO: remove code duplication in two class specializations for a simd_view @@ -235,23 +162,16 @@ class simd_view> /// simd v = 1; /// auto v1 = v.select<2, 1>(0); /// auto v2 = v1[0]; // simd_view of a nested region for a single element -template -class simd_view, NestedRegion>> +template +class simd_view, NestedRegion>> : public detail::simd_view_impl< - BaseTy, - std::pair, NestedRegion>, - simd_view, - NestedRegion>>> { + BaseTy, std::pair, NestedRegion>> { template friend class simd; - template friend class detail::simd_view_impl; + template friend class detail::simd_view_impl; public: - using RegionTy = - std::pair, NestedRegion>; - using BaseClass = - detail::simd_view_impl>; + using RegionTy = std::pair, NestedRegion>; + using BaseClass = detail::simd_view_impl; using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; static_assert(1 == length, "length of this view is not equal to 1"); @@ -264,35 +184,23 @@ class simd_view::value && \ - detail::is_vectorizable_v::value>> \ - ESIMD_INLINE friend bool operator RELOP(const simd_view &X, T1 Y) { \ - return (element_type)X RELOP Y; \ - } - - DEF_RELOP(>) - DEF_RELOP(>=) - DEF_RELOP(<) - DEF_RELOP(<=) - DEF_RELOP(==) - DEF_RELOP(!=) - -#undef DEF_RELOP + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(>) + 
__ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(>=) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(<) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(<=) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(==) + __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(!=) }; +#undef __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP + } // namespace esimd } // namespace experimental } // namespace intel diff --git a/sycl/test/esimd/esimd-util-compiler-eval.cpp b/sycl/test/esimd/esimd-util-compiler-eval.cpp index ed03ea3ce81d1..52cac182d32b3 100644 --- a/sycl/test/esimd/esimd-util-compiler-eval.cpp +++ b/sycl/test/esimd/esimd-util-compiler-eval.cpp @@ -19,11 +19,8 @@ static_assert(log2<1024 * 1024>() == 20, ""); using BaseTy = simd; using RegionTy = region1d_t; -using RegionTy1 = region1d_scalar_t; -static_assert( - !is_simd_view_v< - simd_view_impl>>::value, - ""); -static_assert(is_simd_view_v>::value, ""); -static_assert(is_simd_view_v>::value, ""); -static_assert(!is_simd_view_v::value, ""); +using RegionTy1 = region1d_scalar_t; +static_assert(!is_simd_view_type_v>, ""); +static_assert(is_simd_view_type_v>, ""); +static_assert(is_simd_view_type_v>, ""); +static_assert(!is_simd_view_type_v, ""); diff --git a/sycl/test/esimd/esimd_math.cpp b/sycl/test/esimd/esimd_math.cpp index 17420b678fd2c..e5b0b5e7d2d3e 100644 --- a/sycl/test/esimd/esimd_math.cpp +++ b/sycl/test/esimd/esimd_math.cpp @@ -8,12 +8,12 @@ using namespace sycl::ext::intel::experimental::esimd; bool test_esimd_mask() __attribute__((sycl_device)) { - simd a(0); + simd_mask<16> a(0); a.select<4, 1>(4) = 1; a.select<4, 1>(12) = 1; unsigned int b = esimd_pack_mask(a); - simd c = esimd_unpack_mask<16>(b); + simd_mask<16> c = esimd_unpack_mask<16>(b); unsigned int d = esimd_pack_mask(c); diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp index 23726115e189e..989ae37b6818b 100644 --- a/sycl/test/esimd/intrins_trans.cpp +++ b/sycl/test/esimd/intrins_trans.cpp @@ -42,7 +42,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { simd v1(0, x + z); simd offsets(0, y); simd v_addr(reinterpret_cast(ptr)); - simd pred; + simd_mask pred; v_addr += offsets; __esimd_flat_atomic0( @@ -50,10 +50,10 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.inc.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) __esimd_flat_atomic1( - v_addr.data(), v1, pred.data()); + v_addr.data(), v1.data(), pred.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.add.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) __esimd_flat_atomic2( - v_addr.data(), v1, v1, pred.data()); + v_addr.data(), v1.data(), v1.data(), pred.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.cmpxchg.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) uintptr_t addr = reinterpret_cast(ptr); diff --git a/sycl/test/esimd/lane_id.cpp b/sycl/test/esimd/lane_id.cpp index df6031ac1810f..0f25e7cc4c6e6 100644 --- a/sycl/test/esimd/lane_id.cpp +++ b/sycl/test/esimd/lane_id.cpp @@ -26,7 +26,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo(int x) { SIMT_BEGIN(16, lane) //CHECK: define internal spir_func void @_ZZ3fooiENKUlvE_clEv({{.*}}) {{.*}} #[[ATTR:[0-9]+]] //CHECK: %{{[0-9a-zA-Z_.]+}} = tail call spir_func i32 @_Z15__esimd_lane_idv() - v.select<1, 0>(lane) = x++; + v.select<1, 1>(lane) = 
x++; SIMT_END return v; } diff --git a/sycl/test/esimd/operators.cpp b/sycl/test/esimd/operators.cpp new file mode 100644 index 0000000000000..6e9334cba2175 --- /dev/null +++ b/sycl/test/esimd/operators.cpp @@ -0,0 +1,514 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify %s +// expected-no-diagnostics + +#include + +using namespace sycl::ext::intel::experimental::esimd; + +// dummy use of 'v' - storing it to memory +#define USE_v *(decltype(v) *)(++out) = v + +// --- bitwise +template +[[intel::sycl_explicit_simd]] auto +bitwise_op_test_impl(const simd &x, simd &x1, + const simd &y, simd &y1, + const simd_mask<8> &m, simd_mask<8> &m1, + simd *out) { + int a = 1; + + // simd ^ simd + { + auto k = x1 ^= y; + auto v = k ^ y; + USE_v; + } + { + auto v = x ^ y; + USE_v; + } + // simd ^ SCALAR + { + auto k = x1 ^= 5; + auto v = k ^ y; + USE_v; + } + { + auto v = x ^ (T2)5; + USE_v; + } + // SCALAR ^ simd + { + auto v = (T1)5 ^ y; + USE_v; + } + + // mask ^ mask + { + auto k = m1 ^= m; + auto v = k ^ m; + USE_v; + } + { + auto v = m ^ m1; + USE_v; + } + // mask ^ SCALAR + { + auto k = m1 ^= a; + auto v = m ^ k; + USE_v; + } + { + auto v = m ^ 5; + USE_v; + } + // SCALAR ^ mask + { + auto v = 5 ^ m; + USE_v; + } + + // simd_view ^ simd_view + { + simd k = x1.template select<8, 1>() ^= y1.template select<8, 1>(); + auto v = k ^ y; + USE_v; + } + { + simd k = x1.template select<8, 1>().template select<8, 1>() ^= + y1.template select<8, 1>().template select<8, 1>(); + auto v = k ^ y; + USE_v; + } + { + auto v = x1.template select<8, 1>() ^ y1.template select<8, 1>(); + USE_v; + } + { + auto v = x1.template select<8, 1>().template select<8, 1>() ^ + y1.template select<8, 1>().template select<8, 1>(); + USE_v; + } + // simd ^ simd_view + { + auto k = x1 ^= y1.template select<8, 1>(); + auto v = k ^ y; + USE_v; + } + { + auto v = x ^ y1.template select<8, 1>(); + USE_v; + } + // simd_view ^ simd + { + simd k = x1.template select<8, 1>() ^= y; + auto v = k ^ y; + USE_v; + } + { + auto v = x1.template select<8, 1>() ^ y; + USE_v; + } + + // simd_view ^ simd_view + { + simd_mask<8> k = m1.select<8, 1>() ^= m1.select<8, 1>(); + auto v = k ^ m; + USE_v; + } + { + simd_mask<8> k = m1.select<8, 1>().select<8, 1>() ^= + m1.select<8, 1>().select<8, 1>(); + auto v = k ^ m; + USE_v; + } + { + auto v = m1.select<8, 1>() ^ m1.select<8, 1>(); + USE_v; + } + { + auto v = + m1.select<8, 1>().select<8, 1>() ^ m1.select<8, 1>().select<8, 1>(); + USE_v; + } + // simd_mask ^ simd_view + { + auto k = m1 ^= m1.select<8, 1>(); + auto v = k ^ m; + USE_v; + } + { + auto v = m ^ m1.select<8, 1>(); + USE_v; + } + { + auto v = m ^ m1.select<8, 1>().select<8, 1>(); + USE_v; + } + // simd_view ^ simd_mask + { + simd_mask<8> k = m1.select<8, 1>() ^= m; + auto v = k ^ m; + USE_v; + } + { + auto v = m1.select<8, 1>() ^ m; + USE_v; + } + { + auto v = m1.select<8, 1>().select<8, 1>() ^ m; + USE_v; + } + + // simd_view ^ SCALAR + { + simd k = x1.template select<8, 1>() ^= (T2)5; + auto v = k ^ y; + USE_v; + } + { + auto v = x1.template select<8, 1>() ^ (T2)5; + USE_v; + } + { + auto v = x1.template select<8, 1>().template select<8, 1>() ^ (T2)5; + USE_v; + } + // SCALAR ^ simd_view + { + auto v = (T1)5 ^ y1.template select<8, 1>(); + USE_v; + } + { + auto v = (T1)5 ^ y1.template select<8, 1>().template select<8, 1>(); + USE_v; + } + + // simd_view ^ SCALAR + { + simd_mask<8> k = m1.template select<8, 1>() ^= a; + auto v = k ^ m; + USE_v; + } + { + auto v = m1.template select<8, 1>() ^ a; + USE_v; + } + { + auto 
v = m1.template select<8, 1>().template select<8, 1>() ^ a; + USE_v; + } + // SCALAR ^ simd_view + { + auto v = a ^ y1.template select<8, 1>(); + USE_v; + } + { + auto v = a ^ y1.template select<8, 1>().template select<8, 1>(); + USE_v; + } +} + +template +[[intel::sycl_explicit_simd]] void bitwise_op_test(simd *out) { + simd x((T1)10); + simd x1((T1)11); + const simd y((T2)17); + simd y1((T2)19); + const simd_mask<8> m(1); + simd_mask<8> m1(0); + + bitwise_op_test_impl(x, x1, y, y1, m, m1, out); +} + +[[intel::sycl_explicit_simd]] void bitwise_op_tests(simd *out) { + bitwise_op_test(out); + bitwise_op_test(out); + bitwise_op_test(out); +} + +// --- arithmetic +template +[[intel::sycl_explicit_simd]] auto +arith_bin_op_test_impl(const simd &x, simd &x1, + const simd &y, simd &y1, + simd *out) { + // simd * simd + { + auto k = x1 *= y; + auto v = x * k; + USE_v; + } + { + auto v = x * y; + USE_v; + } + // simd * SCALAR + { + auto k = x1 *= (T2)5; + auto v = x * k; + USE_v; + } + { + auto v = x * (T2)5; + USE_v; + } + // SCALAR * simd + { + auto v = (T1)5 * y; + USE_v; + } + + // simd_view * simd_view + { + simd k = x1.template select<8, 1>() *= y1.template select<8, 1>(); + auto v = x1.template select<8, 1>() * k; + USE_v; + } + { + auto v = x1.template select<8, 1>() * y1.template select<8, 1>(); + USE_v; + } + // simd * simd_view + { + auto k = x1 *= y1.template select<8, 1>(); + auto v = x * k; + USE_v; + } + { + auto v = x * y1.template select<8, 1>(); + USE_v; + } + // simd_view * simd + { + simd k = x1.template select<8, 1>() *= y; + auto v = k * y; + USE_v; + } + { + auto v = x1.template select<8, 1>() * y; + USE_v; + } + + // simd_view * SCALAR + { + simd k = x1.template select<8, 1>() *= (T2)5; + auto v = k * (T2)5; + USE_v; + } + { + auto v = x1.template select<8, 1>() * (T2)5; + USE_v; + } + // SCALAR * simd_view + { + auto v = (T1)5 * y1.template select<8, 1>(); + USE_v; + } +} + +template +[[intel::sycl_explicit_simd]] void arith_bin_op_test(simd *out) { + simd x((T1)10); + simd x1((T1)11); + const simd y((T2)17); + simd y1((T2)19); + + arith_bin_op_test_impl(x, x1, y, y1, out); +} + +[[intel::sycl_explicit_simd]] void +arith_bin_op_tests(simd *out) { + arith_bin_op_test(out); + arith_bin_op_test(out); + arith_bin_op_test(out); + arith_bin_op_test(out); + arith_bin_op_test(out); +} + +// --- equality comparison + +template +[[intel::sycl_explicit_simd]] auto +equ_cmp_test_impl(const simd &x, simd &x1, const simd &y, + simd &y1, const simd_mask<8> &m, simd_mask<8> &m1, + simd *out) { + // simd == simd + { + auto v = x == y; + USE_v; + } + // simd == SCALAR + { + auto v = x == (T2)5; + USE_v; + } + // SCALAR == simd + { + auto v = (T1)5 == y; + USE_v; + } + + // mask == mask + { + auto v = m == m1; + USE_v; + } + // mask == SCALAR + { + auto v = m == 5; + USE_v; + } + // SCALAR == mask + { + auto v = 5 == m; + USE_v; + } + + // simd_view == simd_view + { + auto v = x1.template select<8, 1>() == y1.template select<8, 1>(); + USE_v; + } + // simd == simd_view + { + auto v = x == y1.template select<8, 1>(); + USE_v; + } + // simd_view == simd + { + auto v = x1.template select<8, 1>() == y; + USE_v; + } + + // simd_view == simd_view + { + auto v = m1.select<8, 1>() == m1.select<8, 1>(); + USE_v; + } + // simd_mask == simd_view + { + auto v = m == m1.select<8, 1>(); + USE_v; + } + // simd_view == simd_mask + { + auto v = m1.select<8, 1>() == m; + USE_v; + } + + // simd_view == SCALAR + { + auto v = x1.template select<8, 1>() == (T2)5; + USE_v; + } + // SCALAR == simd_view + { + auto v 
= (T1)5 == y1.template select<8, 1>(); + USE_v; + } + + // simd_view == SCALAR + int a = 1; + { + auto v = m1.select<8, 1>() == a; + USE_v; + } + // SCALAR == simd_view + { + auto v = a == m1.select<8, 1>(); + USE_v; + } +} + +template +[[intel::sycl_explicit_simd]] void equ_cmp_test(simd *out) { + simd x((T1)10); + simd x1((T1)11); + const simd y((T2)17); + simd y1((T2)19); + const simd_mask<8> m(1); + simd_mask<8> m1(0); + + equ_cmp_test_impl(x, x1, y, y1, m, m1, out); +} + +[[intel::sycl_explicit_simd]] void equ_cmp_tests(simd *out) { + equ_cmp_test(out); + equ_cmp_test(out); + equ_cmp_test(out); + equ_cmp_test(out); + equ_cmp_test(out); +} + +// --- comparison + +template +[[intel::sycl_explicit_simd]] auto +lt_cmp_test_impl(const simd &x, simd &x1, const simd &y, + simd &y1, const simd_mask<8> &m, simd_mask<8> &m1, + simd *out) { + // simd < simd + { + auto v = x < y; + USE_v; + } + // simd < SCALAR + { + auto v = x < (T2)5; + USE_v; + } + // SCALAR < simd + { + auto v = (T1)5 == y; + USE_v; + } + + // simd_view < simd_view + { + auto v = x1.template select<8, 1>() < y1.template select<8, 1>(); + USE_v; + } + // simd < simd_view + { + auto v = x < y1.template select<8, 1>(); + USE_v; + } + // simd_view < simd + { + auto v = x1.template select<8, 1>() < y; + USE_v; + } + + // simd_view < SCALAR + { + auto v = x1.template select<8, 1>() < (T2)5; + USE_v; + } + // SCALAR == simd_view + { + auto v = (T1)5 < y1.template select<8, 1>(); + USE_v; + } +} + +template +[[intel::sycl_explicit_simd]] void lt_cmp_test(simd *out) { + simd x((T1)10); + simd x1((T1)11); + const simd y((T2)17); + simd y1((T2)19); + const simd_mask<8> m(1); + simd_mask<8> m1(0); + + lt_cmp_test_impl(x, x1, y, y1, m, m1, out); +} + +[[intel::sycl_explicit_simd]] void lt_cmp_tests(simd *out) { + lt_cmp_test(out); + lt_cmp_test(out); + lt_cmp_test(out); + lt_cmp_test(out); + lt_cmp_test(out); +} diff --git a/sycl/test/esimd/regression/simd_wrapper.cpp b/sycl/test/esimd/regression/simd_wrapper.cpp index 0655e220a6e73..dee22b23fa5b3 100644 --- a/sycl/test/esimd/regression/simd_wrapper.cpp +++ b/sycl/test/esimd/regression/simd_wrapper.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify %s +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-device-only -Xclang -verify %s #include #include diff --git a/sycl/test/esimd/simd.cpp b/sycl/test/esimd/simd.cpp index 5b19035211d15..a38b732751312 100644 --- a/sycl/test/esimd/simd.cpp +++ b/sycl/test/esimd/simd.cpp @@ -43,6 +43,7 @@ void test_conversion() SYCL_ESIMD_FUNCTION { simd f = v; simd c = f; simd c1 = f.select<16, 1>(0); + c.select<32, 1>(0) = f; f = v + static_cast>(c); } @@ -64,16 +65,24 @@ bool test_simd_format() SYCL_ESIMD_FUNCTION { (decltype(ref3)::getSizeX() == 4) && (decltype(ref3)::getSizeY() == 8); } -bool test_simd_select() SYCL_ESIMD_FUNCTION { - simd v(0, 1); - auto ref0 = v.select<4, 2>(1); // r{1, 3, 5, 7} - auto ref1 = v.bit_cast_view(); // 0,1,2,3; - // 4,5,6,7; - // 8,9,10,11; - // 12,13,14,15 - auto ref2 = ref1.select<2, 1, 2, 2>(0, 1); - return ref0[0] == 1 && decltype(ref2)::getSizeX() == 2 && - decltype(ref2)::getStrideY() == 1; +bool test_simd_select(int a) SYCL_ESIMD_FUNCTION { + { + simd f = a; + simd c1 = 2; + c1.select<16, 1>(0) = f.select<16, 1>(0); + c1.select<16, 1>(0).select<16, 1>(0) = f.select<16, 1>(0).select<16, 1>(0); + } + { + simd v(0, 1); + auto ref0 = v.select<4, 2>(1); // r{1, 3, 5, 7} + auto ref1 = v.bit_cast_view(); // 0,1,2,3; + // 4,5,6,7; + // 8,9,10,11; + // 12,13,14,15 + auto ref2 = ref1.select<2, 1, 2, 2>(0, 1); 
+ return ref0[0] == 1 && decltype(ref2)::getSizeX() == 2 && + decltype(ref2)::getStrideY() == 1; + } } bool test_2d_offset() SYCL_ESIMD_FUNCTION { @@ -111,7 +120,6 @@ bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION { v0 <<= v1; v1 = -v0; v0 = ~v1; - v1 = !v0; return v1[0] == 1; } @@ -259,7 +267,7 @@ bool test_simd_iselect() SYCL_ESIMD_FUNCTION { simd a(0, 2); auto data = v.iselect(a); data += 16; - v.iupdate(a, data, 1); + v.iupdate(a, data, simd_mask<8>(1)); auto ref = v.select<8, 2>(0); return ref[0] == 16 && ref[14] == 32; } diff --git a/sycl/test/esimd/simd_mask.cpp b/sycl/test/esimd/simd_mask.cpp new file mode 100644 index 0000000000000..a83c1f53545f9 --- /dev/null +++ b/sycl/test/esimd/simd_mask.cpp @@ -0,0 +1,78 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note %s +// RUN: %clangxx -fsycl -fsyntax-only %s + +// This test checks that both host and device compilers can +// successfully compile simd_mask APIs. + +#include +#include +#include +#include + +using namespace sycl::ext::intel::experimental::esimd; +using namespace cl::sycl; + +#define DEFINE_BIN_OP_TEST(op, name) \ + template \ + SYCL_EXTERNAL SYCL_ESIMD_FUNCTION simd_mask test_impl_##name( \ + simd_mask &m1, simd_mask &m2) { \ + return m1 op m2; \ + } \ + \ + simd_mask<1> test_impl_1_##name(simd_mask<1> &m1, simd_mask<1> &m2) { \ + return test_impl_##name(m1, m2); \ + } \ + \ + simd_mask<17> test_impl_17_##name(simd_mask<17> &m1, simd_mask<17> &m2) { \ + return test_impl_##name(m1, m2); \ + } \ + \ + simd_mask<32> test_impl_32_##name(simd_mask<32> &m1, simd_mask<32> &m2) { \ + return test_impl_##name(m1, m2); \ + } + +DEFINE_BIN_OP_TEST(&&, and) +DEFINE_BIN_OP_TEST(||, or) +DEFINE_BIN_OP_TEST(&, bit_and) +DEFINE_BIN_OP_TEST(|, bit_or) +DEFINE_BIN_OP_TEST(^, xor) +DEFINE_BIN_OP_TEST(==, eq) +DEFINE_BIN_OP_TEST(!=, ne) +DEFINE_BIN_OP_TEST(&=, bit_and_eq) +DEFINE_BIN_OP_TEST(|=, bit_or_eq) +DEFINE_BIN_OP_TEST(^=, xor_eq) + +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION simd_mask<8> misc_tests(bool val) { + simd_mask<8> m1(val); // broadcast constructor + simd_mask<8> m2; // default constructor + simd_mask<8> m3(m1[4]); // operator[] + simd_mask<8> m4 = !m3; // operator! 
+ static_assert(m4.length == 8, "size() failed"); + simd ch1(1); + simd ch2(2); + simd_mask<8> m5 = ch1 > ch2; + m1[3] ^= 1; // binop on writable single-element view + ch1.merge(ch2, m1.select<8, 1>(0)); // simd_view used as mask + + return m5; +} + +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void compat_test(float *ptr) { + simd pred(1); + simd offsets; + simd pred1(1); + auto pred2 = pred1.bit_cast_view(); + + // expected-warning@+1 {{deprecated}} + auto x1 = gather(ptr, offsets, pred); + // expected-warning@+1 {{deprecated}} + auto x11 = gather(ptr, offsets, pred2); + // expected-warning@+1 {{deprecated}} + auto x2 = gather(ptr, offsets, simd{}); + simd_mask<16> m1(0); + // expected-warning@+1 {{deprecated}} + m1 = pred; + simd_mask<16> m2(0); + // expected-warning@+1 {{deprecated}} + m2 = std::move(pred); +} diff --git a/sycl/test/esimd/simd_merge.cpp b/sycl/test/esimd/simd_merge.cpp index 5fed799cf90c8..bd52b914df74c 100644 --- a/sycl/test/esimd/simd_merge.cpp +++ b/sycl/test/esimd/simd_merge.cpp @@ -10,7 +10,7 @@ using namespace sycl::ext::intel::experimental::esimd; bool test_simd_merge1() __attribute__((sycl_device)) { simd v0 = 1; simd v1 = 2; - simd mask = 0; + simd_mask<16> mask = 0; mask.select<4, 4>(0) = 1; v0.merge(v1, mask); return v0[0] == 2 && v0[4] == 2 && v0[8] == 2 && v0[12] == 2; @@ -20,7 +20,7 @@ bool test_simd_merge2() __attribute__((sycl_device)) { simd v0 = 1; simd v1 = 2; simd v2 = 3; - simd mask = 0; + simd_mask<16> mask = 0; mask.select<4, 4>(0) = 1; v0.merge(v1, v2, (v1 < v2) & mask); return v0[0] == 2 && v0[4] == 2 && v0[8] == 2 && v0[12] == 2 && v0[3] == 3 && @@ -30,7 +30,7 @@ bool test_simd_merge2() __attribute__((sycl_device)) { bool test_simd_merge2d1() __attribute__((sycl_device)) { simd v0 = 1; simd v1 = 2; - simd mask = 0; + simd_mask<16> mask = 0; mask.select<4, 4>(0) = 1; auto v0_2d = v0.bit_cast_view(); v0_2d.merge(v1, mask); @@ -41,7 +41,7 @@ bool test_simd_merge2d2() __attribute__((sycl_device)) { simd v0 = 1; simd v1 = 2; simd v2 = 3; - simd mask = 0; + simd_mask<16> mask = 0; mask.select<4, 4>(0) = 1; auto v0_2d = v0.bit_cast_view(); v0_2d.merge(v1, v2, mask); diff --git a/sycl/test/esimd/simd_replicate_deprecated.cpp b/sycl/test/esimd/simd_replicate_deprecated.cpp index 52adc19670b16..07d6d05b54b30 100644 --- a/sycl/test/esimd/simd_replicate_deprecated.cpp +++ b/sycl/test/esimd/simd_replicate_deprecated.cpp @@ -9,8 +9,8 @@ using namespace sycl::ext::intel::experimental::esimd; bool test_replicate1() { simd v0(0, 1); // expected-warning@+3 2 {{deprecated}} - // expected-note@sycl/ext/intel/experimental/esimd/simd.hpp:* {{}} - // expected-note@sycl/ext/intel/experimental/esimd/simd.hpp:* {{has been explicitly marked deprecated here}} + // expected-note@sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:* {{}} + // expected-note@sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:* {{has been explicitly marked deprecated here}} auto v0_rep = v0.replicate<4, 2>(2); return v0[2] == v0_rep[2] && v0[3] == v0_rep[5]; @@ -19,8 +19,8 @@ bool test_replicate1() { bool test_replicate2() { simd v0(0, 1); // expected-warning@+3 2 {{deprecated}} - // expected-note@sycl/ext/intel/experimental/esimd/simd.hpp:* {{}} - // expected-note@sycl/ext/intel/experimental/esimd/simd.hpp:* {{has been explicitly marked deprecated here}} + // expected-note@sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:* {{}} + // expected-note@sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:* {{has been explicitly marked deprecated here}} auto v0_rep = 
v0.replicate<2, 4, 2>(1); return v0_rep[0] == v0[1] && v0_rep[1] == v0[2] && v0_rep[2] == v0[5]; @@ -29,8 +29,8 @@ bool test_replicate2() { bool test_replicate3() { simd v0(0, 1); // expected-warning@+3 2 {{deprecated}} - // expected-note@sycl/ext/intel/experimental/esimd/simd.hpp:* {{}} - // expected-note@sycl/ext/intel/experimental/esimd/simd.hpp:* {{has been explicitly marked deprecated here}} + // expected-note@sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:* {{}} + // expected-note@sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:* {{has been explicitly marked deprecated here}} auto v0_rep = v0.replicate<2, 4, 2, 2>(1); return v0_rep[0] == v0[1] && v0_rep[1] == v0[3] && v0_rep[2] == v0[5]; diff --git a/sycl/test/esimd/simd_subscript.cpp b/sycl/test/esimd/simd_subscript.cpp index 5e1ae1b1f56fa..bf41f2d8b895b 100644 --- a/sycl/test/esimd/simd_subscript.cpp +++ b/sycl/test/esimd/simd_subscript.cpp @@ -65,7 +65,7 @@ void test_simd_writable_subscript() SYCL_ESIMD_FUNCTION { v[1] = 0; // returns simd_view // CHECK: simd_subscript.cpp:69{{.*}}warning: {{.*}} deprecated - // CHECK: sycl/ext/intel/experimental/esimd/simd.hpp:{{.*}} note: {{.*}} has been explicitly marked deprecated here + // CHECK: sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:{{.*}} note: {{.*}} has been explicitly marked deprecated here v(1) = 0; } @@ -76,7 +76,7 @@ void test_simd_const_subscript() SYCL_ESIMD_FUNCTION { cv[1] = 0; // CHECK: simd_subscript.cpp:80{{.*}}warning: {{.*}} deprecated - // CHECK: sycl/ext/intel/experimental/esimd/simd.hpp:{{.*}} note: {{.*}} has been explicitly marked deprecated here + // CHECK: sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp:{{.*}} note: {{.*}} has been explicitly marked deprecated here int val3 = cv(0); } diff --git a/sycl/test/esimd/simd_view.cpp b/sycl/test/esimd/simd_view.cpp index ffdb3b6f4f791..d7cbb2ed2c35a 100644 --- a/sycl/test/esimd/simd_view.cpp +++ b/sycl/test/esimd/simd_view.cpp @@ -6,7 +6,7 @@ using namespace sycl::ext::intel::experimental::esimd; -SYCL_ESIMD_FUNCTION bool test_simd_view_bin_ops() { +SYCL_ESIMD_FUNCTION auto test_simd_view_bin_ops() { simd v0 = 1; simd v1 = 2; auto ref0 = v0.select<8, 2>(0); @@ -20,7 +20,44 @@ SYCL_ESIMD_FUNCTION bool test_simd_view_bin_ops() { ref0 *= 2; ref0 /= ref1; ref0 /= 2; - return v0[0] == 1; + if (v0[0] == 1) + return ref0 + (short)3; + else + return ref0 + ref1; +} + +SYCL_ESIMD_FUNCTION auto test_simd_view_bitwise_ops() { + simd v0 = 1; + simd v1 = 2; + auto ref0 = v0.select<8, 2>(0); + auto ref1 = v1.select<8, 2>(0); + simd v2 = (ref0 | ref1) & (ref0 | 3); + ref0 |= 3; + ref0 |= ref1; + simd v3 = (ref0 ^ ref1) & (ref0 ^ 3); + ref0 ^= 3; + ref0 ^= ref1; + simd v4 = (ref0 & ref1) | (ref0 & 3); + ref0 &= 3; + ref0 &= ref1; + return ref0; +} + +SYCL_ESIMD_FUNCTION auto test_simd_mask_view_bitwise_ops() { + simd_mask<16> v0 = 1; + simd_mask<16> v1 = 2; + auto ref0 = v0.select<8, 2>(0); + auto ref1 = v1.select<8, 2>(0); + simd_mask<8> v2 = (ref0 | ref1) & (ref0 | 3); + ref0 |= 3; + ref0 |= ref1; + simd_mask<8> v3 = (ref0 ^ ref1) & (ref0 ^ 3); + ref0 ^= 3; + ref0 ^= ref1; + simd_mask<8> v4 = (ref0 & ref1) | (ref0 & 3); + ref0 &= 3; + ref0 &= ref1; + return ref0; } SYCL_ESIMD_FUNCTION bool test_simd_view_unary_ops() { @@ -31,7 +68,7 @@ SYCL_ESIMD_FUNCTION bool test_simd_view_unary_ops() { ref0 <<= ref1; ref1 = -ref0; ref0 = ~ref1; - auto mask = !ref0; + auto mask = !(ref0 < ref1); return v1[0] == 1; } @@ -54,10 +91,10 @@ SYCL_ESIMD_FUNCTION bool test_simd_view_assign3() { simd v1 = 1; 
auto mask = (v0.select<16, 1>(0) > v1.select<16, 1>(0)); auto mask2 = (v0 > v1); - simd s = 0; + simd_mask<64> s = 0; auto g4 = s.bit_cast_view(); - simd val = (g4.row(2) & mask); - simd val1 = + simd_mask<16> val = (g4.row(2) & mask); + simd_mask<16> val1 = (g4.row(2) & mask2.bit_cast_view().row(0)); return val[0] == 0 && val1[0] == 0; } @@ -123,21 +160,21 @@ void test_simd_view_impl_api_ret_types() SYCL_ESIMD_FUNCTION { simd x = 0; auto v1 = x.select<2, 1>(0); // simd_view, region1d_t> - static_assert(detail::is_simd_view_v::value, ""); + static_assert(detail::is_simd_view_type_v, ""); auto v2 = v1.select<1, 1>( 0); // simd_view, std::pair, region_base>> - static_assert(detail::is_simd_view_v::value, ""); + static_assert(detail::is_simd_view_type_v, ""); auto v2_int = v2.bit_cast_view(); - static_assert(detail::is_simd_view_v::value, ""); + static_assert(detail::is_simd_view_type_v, ""); auto v2_int_2D = v2.bit_cast_view(); - static_assert(detail::is_simd_view_v::value, ""); + static_assert(detail::is_simd_view_type_v, ""); auto v3 = x.select<2, 1>(2); auto &v4 = (v1 += v3); - static_assert(detail::is_simd_view_v::value, ""); - static_assert(detail::is_simd_view_v::value, ""); + static_assert(detail::is_simd_view_type_v, ""); + static_assert(detail::is_simd_view_type_v, ""); } void test_simd_view_subscript() SYCL_ESIMD_FUNCTION { @@ -169,9 +206,12 @@ void test_simd_view_writeable_subscript() SYCL_ESIMD_FUNCTION { void test_simd_view_binop_with_conv_to_scalar() SYCL_ESIMD_FUNCTION { simd s = 0; auto g = s.bit_cast_view(); - auto x = g.row(1) - (g.row(1))[0]; // binary op - auto y = g.row(1) & (g.row(1))[0]; // bitwise op - auto z = g.row(1) < (g.row(1))[0]; // relational op + auto x1 = g.row(1) - (g.row(1))[0]; // binary op + auto x2 = (g.row(1))[0] - g.row(1); // binary op + auto y1 = g.row(1) & (g.row(1))[0]; // bitwise op + auto y2 = (g.row(1))[0] & g.row(1); // bitwise op + auto z1 = g.row(1) < (g.row(1))[0]; // relational op + auto z2 = (g.row(1))[0] < g.row(1); // relational op } // This code is OK. 
The result of bit_cast_view should be mapped @@ -201,3 +241,65 @@ void test_simd_view_len1_binop() SYCL_ESIMD_FUNCTION { auto v2 = s.select<2, 1>(0); auto x = v1 * v2; } + +void test_simd_view_assign_op() SYCL_ESIMD_FUNCTION { + // multiple elements + { +#define N 4 + // simd - assign views of different element type + simd v1 = 0; + simd v2 = 0; + // - region is a region type (top-level region) + v1.select(0) = v2.select(0); + v2.select(0) = v1.select(0); + // - region is a std::pair (nested region) + v1.select<8, 2>(0).select(1) = v2.select<8, 2>(0).select(1); + v2.select<8, 2>(0).select(1) = v1.select<8, 2>(0).select(1); + // - first region is top-level, second - nested + v1.select<4, 2>(0) = v2.select<8, 2>(0).select<4, 1>(1); + // - first region is nested, second - top-level + v2.select<8, 2>(0).select<4, 1>(1) = v1.select<4, 2>(0); + + // simd_mask + simd_mask<32> m1 = 0; + simd_mask<16> m2 = 0; + // - region is a region type (top-level region) + m1.select<4, 2>(0) = m2.select<4, 2>(0); + m2.select<4, 2>(0) = m1.select<4, 2>(0); + // - region is a std::pair (nested region) + m1.select<8, 2>(0).select(1) = m2.select<8, 2>(0).select(1); + m2.select<8, 2>(0).select(1) = m1.select<8, 2>(0).select(1); + // - first region is top-level, second - nested + m1.select<4, 2>(0) = m2.select<8, 2>(0).select<4, 1>(1); + // - first region is nested, second - top-level + m2.select<8, 2>(0).select<4, 1>(1) = m1.select<4, 2>(0); +#undef N + } + // single element + { +#define N 1 + // simd - assign views of different element type + simd v1 = 0; + simd v2 = 0; + // - region is a region type (top-level region) + v1.select(0) = v2.select(0); + v2[0] = v1[0]; + v2[1] = v1.select(1); + // - region is a std::pair (nested region) + v1.select<4, 2>(0).select(1) = v2.select<4, 2>(0).select(1); + v2.select<4, 2>(0).select(1) = v1.select<4, 2>(0).select(1); + + // simd_mask + simd_mask<16> m1 = 0; + simd_mask<8> m2 = 0; + // - region is a region type (top-level region) + m1.select(0) = m2.select(0); + m2[0] = m1[0]; + m2[1] = m1.select(1); + // - region is a std::pair (nested region) + m1.select<4, 2>(0).select(1) = m2.select<4, 2>(0).select(1); + m2.select<4, 2>(0)[1] = m1.select<4, 2>(0)[1]; + m2.select<4, 2>(0)[2] = m1.select<4, 2>(0).select(2); +#undef N + } +} diff --git a/sycl/test/esimd/simd_view_ret_warn.cpp b/sycl/test/esimd/simd_view_ret_warn.cpp index 072c6299331ba..97324bc7bbecf 100644 --- a/sycl/test/esimd/simd_view_ret_warn.cpp +++ b/sycl/test/esimd/simd_view_ret_warn.cpp @@ -7,7 +7,7 @@ using namespace sycl::ext::intel::experimental::esimd; // and it should be programmers fault, similar to string_view. // However, sometimes we could return simd_view from a function // implicitly. This test checks that users will see a warning in such situation. -simd_view, region1d_t> f1(simd x) { +simd_view, region1d_t> f1(simd x) { // expected-warning@+1 {{address of stack memory associated with parameter 'x' returned}} return x[0]; }
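Usage sketch (illustrative only, not part of the patch): the snippet below shows how the simd_mask<N> alias and the mask-producing relational operators introduced above might be used, following the patterns already exercised by simd_merge.cpp, simd_mask.cpp and operators.cpp in this diff. The umbrella header path and the function names lane_max / every_fourth_and are assumptions made for illustration.

// Minimal, self-contained sketch; assumes the ESIMD umbrella header from this
// patch is available under this path (assumption, adjust to your tree).
#include <sycl/ext/intel/experimental/esimd.hpp>

using namespace sycl::ext::intel::experimental::esimd;

// Per-lane maximum of two vectors: a relational operator now yields a
// simd_mask, which merge() consumes directly.
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION simd<int, 16> lane_max(simd<int, 16> a,
                                                         simd<int, 16> b) {
  simd_mask<16> m = a < b; // 1 where a[i] < b[i], 0 elsewhere
  a.merge(b, m);           // take b[i] where m[i] != 0, keep a[i] otherwise
  return a;
}

// Masks are regular vector objects: lanes can be set through select() and
// masks can be combined with bitwise operators, as in the tests above.
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION simd_mask<16>
every_fourth_and(simd_mask<16> other) {
  simd_mask<16> m = 0;
  m.select<4, 4>(0) = 1; // set lanes 0, 4, 8, 12
  return m & other;
}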