diff --git a/sycl/doc/extensions/README.md b/sycl/doc/extensions/README.md
index 967d6a6e00b84..d9d107cb7a6f6 100644
--- a/sycl/doc/extensions/README.md
+++ b/sycl/doc/extensions/README.md
@@ -13,7 +13,7 @@ DPC++ extensions status:
 | [SYCL_INTEL_deduction_guides](deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) | Supported | |
 | [SYCL_INTEL_device_specific_kernel_queries](DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc) | Proposal | |
 | [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Supported(OpenCL, Level Zero) | |
-| [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Proposal | |
+| [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Partially supported(OpenCL: CPU, GPU) | Not supported: pointer types |
 | [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL) | |
 | [SYCL_INTEL_group_mask](./GroupMask/SYCL_INTEL_group_mask.asciidoc) | Proposal | |
 | [FPGA selector](IntelFPGA/FPGASelector.md) | Supported | |
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index aa7696d45e321..28482dca289cc 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -155,6 +155,7 @@ extern SYCL_EXTERNAL TempRetT __spirv_ImageSampleExplicitLod(SampledType,
   macro(__attribute__((opencl_local)), Arg)
 
 __SPIRV_ATOMICS(__SPIRV_ATOMIC_FLOAT, float)
+__SPIRV_ATOMICS(__SPIRV_ATOMIC_FLOAT, double)
 __SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, int)
 __SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, long)
 __SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, long long)
diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp
index 06053b487d0fd..e537bf6e7d979 100644
--- a/sycl/include/CL/sycl.hpp
+++ b/sycl/include/CL/sycl.hpp
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/sycl/include/CL/sycl/detail/defines.hpp b/sycl/include/CL/sycl/detail/defines.hpp
index 7c4381fae189e..59aa54247028d 100644
--- a/sycl/include/CL/sycl/detail/defines.hpp
+++ b/sycl/include/CL/sycl/detail/defines.hpp
@@ -46,3 +46,10 @@
 #warning "No assumptions will be emitted due to no __builtin_assume available"
 #endif
 #endif
+
+// inline constexpr is a C++17 feature
+#if __cplusplus >= 201703L
+#define __SYCL_INLINE_CONSTEXPR inline constexpr
+#else
+#define __SYCL_INLINE_CONSTEXPR static constexpr
+#endif
diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp
index e3c783c7bf65d..a95e81f5506ac 100644
--- a/sycl/include/CL/sycl/detail/spirv.hpp
+++ b/sycl/include/CL/sycl/detail/spirv.hpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef __SYCL_DEVICE_ONLY__
 __SYCL_INLINE_NAMESPACE(cl) {
@@ -28,7 +29,7 @@ template struct group_scope> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Workgroup;
 };
 
-template <> struct group_scope {
+template <> struct group_scope<::cl::sycl::intel::sub_group> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
 };
 
@@ -69,6 +70,226 @@ T GroupBroadcast(T x, id local_id) {
   return __spirv_GroupBroadcast(group_scope::value, ocl_x, ocl_id);
 }
 
+// Single happens-before means semantics should always apply to all spaces
+// Although consume is unsupported, forwarding to acquire is valid
+static inline constexpr __spv::MemorySemanticsMask::Flag
+getMemorySemanticsMask(intel::memory_order Order) {
+  __spv::MemorySemanticsMask::Flag SpvOrder =
__spv::MemorySemanticsMask::None; + switch (Order) { + case intel::memory_order::relaxed: + SpvOrder = __spv::MemorySemanticsMask::None; + break; + case intel::memory_order::__consume_unsupported: + case intel::memory_order::acquire: + SpvOrder = __spv::MemorySemanticsMask::Acquire; + break; + case intel::memory_order::release: + SpvOrder = __spv::MemorySemanticsMask::Release; + break; + case intel::memory_order::acq_rel: + SpvOrder = __spv::MemorySemanticsMask::AcquireRelease; + break; + case intel::memory_order::seq_cst: + SpvOrder = __spv::MemorySemanticsMask::SequentiallyConsistent; + break; + } + return static_cast<__spv::MemorySemanticsMask::Flag>( + SpvOrder | __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +} + +static inline constexpr __spv::Scope::Flag getScope(intel::memory_scope Scope) { + switch (Scope) { + case intel::memory_scope::work_item: + return __spv::Scope::Invocation; + case intel::memory_scope::sub_group: + return __spv::Scope::Subgroup; + case intel::memory_scope::work_group: + return __spv::Scope::Workgroup; + case intel::memory_scope::device: + return __spv::Scope::Device; + case intel::memory_scope::system: + return __spv::Scope::CrossDevice; + } +} + +template +inline typename detail::enable_if_t::value, T> +AtomicCompareExchange(multi_ptr MPtr, + intel::memory_scope Scope, intel::memory_order Success, + intel::memory_order Failure, T Desired, T Expected) { + auto SPIRVSuccess = getMemorySemanticsMask(Success); + auto SPIRVFailure = getMemorySemanticsMask(Failure); + auto SPIRVScope = getScope(Scope); + auto *Ptr = MPtr.get(); + return __spirv_AtomicCompareExchange(Ptr, SPIRVScope, SPIRVSuccess, + SPIRVFailure, Desired, Expected); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicCompareExchange(multi_ptr MPtr, + intel::memory_scope Scope, intel::memory_order Success, + intel::memory_order Failure, T Desired, T Expected) { + using I = detail::make_unsinged_integer_t; + auto SPIRVSuccess = getMemorySemanticsMask(Success); + auto SPIRVFailure = getMemorySemanticsMask(Failure); + auto SPIRVScope = getScope(Scope); + auto *PtrInt = + reinterpret_cast::pointer_t>( + MPtr.get()); + I DesiredInt = detail::bit_cast(Desired); + I ExpectedInt = detail::bit_cast(Expected); + I ResultInt = __spirv_AtomicCompareExchange( + PtrInt, SPIRVScope, SPIRVSuccess, SPIRVFailure, DesiredInt, ExpectedInt); + return detail::bit_cast(ResultInt); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicLoad(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicLoad(Ptr, SPIRVScope, SPIRVOrder); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicLoad(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order) { + using I = detail::make_unsinged_integer_t; + auto *PtrInt = + reinterpret_cast::pointer_t>( + MPtr.get()); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + I ResultInt = __spirv_AtomicLoad(PtrInt, SPIRVScope, SPIRVOrder); + return detail::bit_cast(ResultInt); +} + +template +inline typename detail::enable_if_t::value> +AtomicStore(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + 
__spirv_AtomicStore(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value> +AtomicStore(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + using I = detail::make_unsinged_integer_t; + auto *PtrInt = + reinterpret_cast::pointer_t>( + MPtr.get()); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + I ValueInt = detail::bit_cast(Value); + __spirv_AtomicStore(PtrInt, SPIRVScope, SPIRVOrder, ValueInt); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicExchange(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicExchange(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicExchange(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + using I = detail::make_unsinged_integer_t; + auto *PtrInt = + reinterpret_cast::pointer_t>( + MPtr.get()); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + I ValueInt = detail::bit_cast(Value); + I ResultInt = + __spirv_AtomicExchange(PtrInt, SPIRVScope, SPIRVOrder, ValueInt); + return detail::bit_cast(ResultInt); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicIAdd(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicIAdd(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicISub(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicISub(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicAnd(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicAnd(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicOr(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicOr(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicXor(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicXor(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicMin(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicMin(Ptr, SPIRVScope, SPIRVOrder, Value); +} + +template +inline typename detail::enable_if_t::value, T> +AtomicMax(multi_ptr MPtr, intel::memory_scope Scope, + intel::memory_order Order, T Value) { + 
auto *Ptr = MPtr.get(); + auto SPIRVOrder = getMemorySemanticsMask(Order); + auto SPIRVScope = getScope(Scope); + return __spirv_AtomicMax(Ptr, SPIRVScope, SPIRVOrder, Value); +} + } // namespace spirv } // namespace detail } // namespace sycl diff --git a/sycl/include/CL/sycl/intel/atomic.hpp b/sycl/include/CL/sycl/intel/atomic.hpp new file mode 100644 index 0000000000000..bbc49ecc210d9 --- /dev/null +++ b/sycl/include/CL/sycl/intel/atomic.hpp @@ -0,0 +1,13 @@ +//==---------------- atomic.hpp - SYCL_INTEL_extended_atomics --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include diff --git a/sycl/include/CL/sycl/intel/atomic_enums.hpp b/sycl/include/CL/sycl/intel/atomic_enums.hpp new file mode 100644 index 0000000000000..a85c9902cd524 --- /dev/null +++ b/sycl/include/CL/sycl/intel/atomic_enums.hpp @@ -0,0 +1,103 @@ +//==---------------- atomic_enums.hpp - SYCL_INTEL_extended_atomics enums --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#ifndef __SYCL_DEVICE_ONLY__ +#include +#endif +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace intel { + +enum class memory_order : int { + relaxed, + acquire, + __consume_unsupported, // helps optimizer when mapping to std::memory_order + release, + acq_rel, + seq_cst +}; +__SYCL_INLINE_CONSTEXPR memory_order memory_order_relaxed = + memory_order::relaxed; +__SYCL_INLINE_CONSTEXPR memory_order memory_order_acquire = + memory_order::acquire; +__SYCL_INLINE_CONSTEXPR memory_order memory_order_release = + memory_order::release; +__SYCL_INLINE_CONSTEXPR memory_order memory_order_acq_rel = + memory_order::acq_rel; +__SYCL_INLINE_CONSTEXPR memory_order memory_order_seq_cst = + memory_order::seq_cst; + +enum class memory_scope : int { + work_item, + sub_group, + work_group, + device, + system +}; +__SYCL_INLINE_CONSTEXPR memory_scope memory_scope_work_item = + memory_scope::work_item; +__SYCL_INLINE_CONSTEXPR memory_scope memory_scope_sub_group = + memory_scope::sub_group; +__SYCL_INLINE_CONSTEXPR memory_scope memory_scope_work_group = + memory_scope::work_group; +__SYCL_INLINE_CONSTEXPR memory_scope memory_scope_device = memory_scope::device; +__SYCL_INLINE_CONSTEXPR memory_scope memory_scope_system = memory_scope::system; + +#ifndef __SYCL_DEVICE_ONLY__ +namespace detail { +// Cannot use switch statement in constexpr before C++14 +// Nested ternary conditions in else branch required for C++11 +#if __cplusplus >= 201402L +static inline constexpr std::memory_order +getStdMemoryOrder(::cl::sycl::intel::memory_order order) { + switch (order) { + case memory_order::relaxed: + return std::memory_order_relaxed; + case memory_order::__consume_unsupported: + return std::memory_order_consume; + case memory_order::acquire: + return std::memory_order_acquire; + case memory_order::release: + return std::memory_order_release; + case memory_order::acq_rel: + return std::memory_order_acq_rel; + case memory_order::seq_cst: + return 
std::memory_order_seq_cst; + } +} +#else +static inline constexpr std::memory_order +getStdMemoryOrder(::cl::sycl::intel::memory_order order) { + return (order == memory_order::relaxed) + ? std::memory_order_relaxed + : (order == memory_order::__consume_unsupported) + ? std::memory_order_consume + : (order == memory_order::acquire) + ? std::memory_order_acquire + : (order == memory_order::release) + ? std::memory_order_release + : (order == memory_order::acq_rel) + ? std::memory_order_acq_rel + : std::memory_order_seq_cst; +} +#endif // __cplusplus +} // namespace detail +#endif // __SYCL_DEVICE_ONLY__ + +} // namespace intel +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/atomic_fence.hpp b/sycl/include/CL/sycl/intel/atomic_fence.hpp new file mode 100644 index 0000000000000..aba95c060b878 --- /dev/null +++ b/sycl/include/CL/sycl/intel/atomic_fence.hpp @@ -0,0 +1,40 @@ +//==----- atomic_fence.hpp - SYCL_INTEL_extended_atomics atomic_fence ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#ifndef __SYCL_DEVICE_ONLY__ +#include +#endif + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace intel { +namespace detail { +using namespace cl::sycl::detail; +} + +static inline void atomic_fence(memory_order order, memory_scope scope) { +#ifdef __SYCL_DEVICE_ONLY__ + auto SPIRVOrder = detail::spirv::getMemorySemanticsMask(order); + auto SPIRVScope = detail::spirv::getScope(scope); + __spirv_MemoryBarrier(SPIRVScope, static_cast(SPIRVOrder)); +#else + (void)scope; + auto StdOrder = detail::getStdMemoryOrder(order); + atomic_thread_fence(StdOrder); +#endif +} + +} // namespace intel +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/atomic_ref.hpp b/sycl/include/CL/sycl/intel/atomic_ref.hpp new file mode 100644 index 0000000000000..89202cb7ca08c --- /dev/null +++ b/sycl/include/CL/sycl/intel/atomic_ref.hpp @@ -0,0 +1,531 @@ +//==----- atomic_ref.hpp - SYCL_INTEL_extended_atomics atomic_ref ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef __SYCL_DEVICE_ONLY__ +#include +#endif +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { + +// Forward declaration +template +class multi_ptr; + +namespace intel { +namespace detail { + +// Import from detail:: into intel::detail:: to improve readability later +using namespace ::cl::sycl::detail; + +using memory_order = cl::sycl::intel::memory_order; +using memory_scope = cl::sycl::intel::memory_scope; + +template +using IsValidAtomicType = + bool_constant::value || std::is_pointer::value>; + +template +using IsValidAtomicAddressSpace = + bool_constant; + +// DefaultOrder parameter is limited to read-modify-write orders +template +using IsValidDefaultOrder = bool_constant; + +template struct memory_order_traits; + +template <> struct memory_order_traits { + static constexpr memory_order read_order = memory_order::relaxed; + static constexpr memory_order write_order = memory_order::relaxed; +}; + +template <> struct memory_order_traits { + static constexpr memory_order read_order = memory_order::acquire; + static constexpr memory_order write_order = memory_order::release; +}; + +template <> struct memory_order_traits { + static constexpr memory_order read_order = memory_order::seq_cst; + static constexpr memory_order write_order = memory_order::seq_cst; +}; + +// Cannot use switch statement in constexpr before C++14 +// Nested ternary conditions in else branch required for C++11 +#if __cplusplus >= 201402L +inline constexpr memory_order getLoadOrder(memory_order order) { + switch (order) { + case memory_order_relaxed: + return memory_order_relaxed; + + case memory_order_acquire: + case memory_order::__consume_unsupported: + case memory_order_acq_rel: + case memory_order_release: + return memory_order_acquire; + + case memory_order_seq_cst: + return memory_order_seq_cst; + } +} +#else +inline constexpr memory_order getLoadOrder(memory_order order) { + return (order == memory_order_relaxed) + ? memory_order_relaxed + : (order == memory_order_seq_cst) ? memory_order_seq_cst + : memory_order_acquire; +} +#endif + +template struct bit_equal; + +template +struct bit_equal::value>> { + bool operator()(const T &lhs, const T &rhs) { return lhs == rhs; } +}; + +template <> struct bit_equal { + bool operator()(const float &lhs, const float &rhs) { + auto LhsInt = detail::bit_cast(lhs); + auto RhsInt = detail::bit_cast(rhs); + return LhsInt == RhsInt; + } +}; + +template <> struct bit_equal { + bool operator()(const double &lhs, const double &rhs) { + auto LhsInt = detail::bit_cast(lhs); + auto RhsInt = detail::bit_cast(rhs); + return LhsInt == RhsInt; + } +}; + +// Functionality for any atomic of type T, reused by partial specializations +template +class atomic_ref_base { + static_assert( + detail::IsValidAtomicType::value, + "Invalid atomic type. 
Valid types are arithmetic and pointer types"); + static_assert(!std::is_same::value, + "intel::atomic_ref does not support bool type"); + static_assert(!(std::is_same::value || + std::is_same::value || + std::is_same::value), + "intel::atomic_ref does not support char type"); + static_assert(!(std::is_same::value || + std::is_same::value), + "intel::atomic_ref does not support short type"); + static_assert(!std::is_pointer::value, + "intel::atomic_ref does not yet support pointer types"); + static_assert(detail::IsValidAtomicAddressSpace::value, + "Invalid atomic address_space. Valid address spaces are: " + "global_space, local_space"); + static_assert( + detail::IsValidDefaultOrder::value, + "Invalid default memory_order for atomics. Valid defaults are: " + "relaxed, acq_rel, seq_cst"); + +public: + using value_type = T; + static constexpr size_t required_alignment = sizeof(T); + static constexpr bool is_always_lock_free = + detail::IsValidAtomicType::value; + static constexpr memory_order default_read_order = + detail::memory_order_traits::read_order; + static constexpr memory_order default_write_order = + detail::memory_order_traits::write_order; + static constexpr memory_order default_read_modify_write_order = DefaultOrder; + static constexpr memory_scope default_scope = DefaultScope; + + bool is_lock_free() const noexcept { + return detail::IsValidAtomicType::value; + } + +#ifdef __SYCL_DEVICE_ONLY__ + explicit atomic_ref_base(T &ref) : ptr(multi_ptr(&ref)) {} +#else + // FIXME: This reinterpret_cast is UB, but happens to work for now + explicit atomic_ref_base(T &ref) + : ptr(reinterpret_cast *>(&ref)) {} +#endif + // Our implementation of copy constructor could be trivial + // Defined this way for consistency with standard atomic_ref + atomic_ref_base(const atomic_ref_base &ref) noexcept { ptr = ref.ptr; }; + atomic_ref_base &operator=(const atomic_ref_base &) = delete; + + void store(T operand, memory_order order = default_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + detail::spirv::AtomicStore(ptr, scope, order, operand); +#else + (void)scope; + ptr->store(operand, detail::getStdMemoryOrder(order)); +#endif + } + + T operator=(T desired) const noexcept { + store(desired); + return desired; + } + + T load(memory_order order = default_read_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicLoad(ptr, scope, order); +#else + (void)scope; + return ptr->load(detail::getStdMemoryOrder(order)); +#endif + } + + operator T() const noexcept { return load(); } + + T exchange(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicExchange(ptr, scope, order, operand); +#else + (void)scope; + return ptr->exchange(operand, detail::getStdMemoryOrder(order)); +#endif + } + + bool compare_exchange_strong(T &expected, T desired, memory_order success, + memory_order failure, + memory_scope scope = default_scope) const + noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + T value = detail::spirv::AtomicCompareExchange(ptr, scope, success, failure, + desired, expected); + bool succeeded = detail::bit_equal()(value, expected); + if (!succeeded) { + expected = value; + } + return succeeded; +#else + (void)scope; + return ptr->compare_exchange_strong(expected, desired, + detail::getStdMemoryOrder(success), + detail::getStdMemoryOrder(failure)); +#endif + } + + bool + 
compare_exchange_strong(T &expected, T desired, + memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { + return compare_exchange_strong(expected, desired, order, order, scope); + } + + bool compare_exchange_weak(T &expected, T desired, memory_order success, + memory_order failure, + memory_scope scope = default_scope) const + noexcept { + // SPIR-V AtomicCompareExchangeWeak is deprecated and equivalent to + // AtomicCompareExchange. For now, use AtomicCompareExchange on device and + // compare_exchange_weak on host +#ifdef __SYCL_DEVICE_ONLY__ + return compare_exchange_strong(expected, desired, success, failure, scope); +#else + (void)scope; + return ptr->compare_exchange_weak(expected, desired, + detail::getStdMemoryOrder(success), + detail::getStdMemoryOrder(failure)); +#endif + } + + bool + compare_exchange_weak(T &expected, T desired, + memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { + return compare_exchange_weak(expected, desired, order, order, scope); + } + +protected: +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr ptr; +#else + std::atomic *ptr; +#endif +}; + +// Hook allowing partial specializations to inherit atomic_ref_base +template +class atomic_ref_impl + : public atomic_ref_base { +public: + using atomic_ref_base::atomic_ref_base; +}; + +// Partial specialization for integral types +template +class atomic_ref_impl::value>> + : public atomic_ref_base { + +public: + using value_type = T; + using difference_type = value_type; + static constexpr size_t required_alignment = sizeof(T); + static constexpr bool is_always_lock_free = + detail::IsValidAtomicType::value; + static constexpr memory_order default_read_order = + detail::memory_order_traits::read_order; + static constexpr memory_order default_write_order = + detail::memory_order_traits::write_order; + static constexpr memory_order default_read_modify_write_order = DefaultOrder; + static constexpr memory_scope default_scope = DefaultScope; + + using atomic_ref_base::atomic_ref_base; + using atomic_ref_base::load; + using atomic_ref_base::compare_exchange_weak; + + T fetch_add(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicIAdd(ptr, scope, order, operand); +#else + (void)scope; + return ptr->fetch_add(operand, detail::getStdMemoryOrder(order)); +#endif + } + + T operator+=(T operand) const noexcept { + return fetch_add(operand) + operand; + } + + T operator++(int) const noexcept { + // TODO: use AtomicIIncrement as an optimization + return fetch_add(1); + } + + T operator++() const noexcept { + // TODO: use AtomicIIncrement as an optimization + return fetch_add(1) + 1; + } + + T fetch_sub(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicISub(ptr, scope, order, operand); +#else + (void)scope; + return ptr->fetch_sub(operand, detail::getStdMemoryOrder(order)); +#endif + } + + T operator-=(T operand) const noexcept { + return fetch_sub(operand) - operand; + } + + T operator--(int) const noexcept { + // TODO: use AtomicIDecrement as an optimization + return fetch_sub(1); + } + + T operator--() const noexcept { + // TODO: use AtomicIDecrement as an optimization + return fetch_sub(1) - 1; + } + + T fetch_and(T operand, memory_order order = 
default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicAnd(ptr, scope, order, operand); +#else + (void)scope; + return ptr->fetch_and(operand, detail::getStdMemoryOrder(order)); +#endif + } + + T operator&=(T operand) const noexcept { + return fetch_and(operand) & operand; + } + + T fetch_or(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicOr(ptr, scope, order, operand); +#else + (void)scope; + return ptr->fetch_or(operand, detail::getStdMemoryOrder(order)); +#endif + } + + T operator|=(T operand) const noexcept { return fetch_or(operand) | operand; } + + T fetch_xor(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicXor(ptr, scope, order, operand); +#else + (void)scope; + return ptr->fetch_xor(operand, detail::getStdMemoryOrder(order)); +#endif + } + + T operator^=(T operand) const noexcept { + return fetch_xor(operand) ^ operand; + } + + T fetch_min(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicMin(ptr, scope, order, operand); +#else + auto load_order = detail::getLoadOrder(order); + T old = load(load_order, scope); + while (operand < old && + !compare_exchange_weak(old, operand, order, scope)) { + } + return old; +#endif + } + + T fetch_max(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::AtomicMax(ptr, scope, order, operand); +#else + auto load_order = detail::getLoadOrder(order); + T old = load(load_order, scope); + while (operand > old && + !compare_exchange_weak(old, operand, order, scope)) { + } + return old; +#endif + } + +private: + using atomic_ref_base::ptr; +}; + +// Partial specialization for floating-point types +// TODO: Leverage floating-point SPIR-V atomics instead of emulation +template +class atomic_ref_impl< + T, DefaultOrder, DefaultScope, AddressSpace, + typename detail::enable_if_t::value>> + : public atomic_ref_base { + +public: + using value_type = T; + using difference_type = value_type; + static constexpr size_t required_alignment = sizeof(T); + static constexpr bool is_always_lock_free = + detail::IsValidAtomicType::value; + static constexpr memory_order default_read_order = + detail::memory_order_traits::read_order; + static constexpr memory_order default_write_order = + detail::memory_order_traits::write_order; + static constexpr memory_order default_read_modify_write_order = DefaultOrder; + static constexpr memory_scope default_scope = DefaultScope; + + using atomic_ref_base::atomic_ref_base; + using atomic_ref_base::load; + using atomic_ref_base::compare_exchange_weak; + + T fetch_add(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { + auto load_order = detail::getLoadOrder(order); + T expected = load(load_order, scope); + T desired; + do { + desired = expected + operand; + } while (!compare_exchange_weak(expected, desired, order, scope)); + return expected; + } + + T operator+=(T operand) const noexcept { + return fetch_add(operand) + operand; + } + + T fetch_sub(T operand, memory_order order 
= default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { + auto load_order = detail::getLoadOrder(order); + T expected = load(load_order, scope); + T desired; + do { + desired = expected - operand; + } while (!compare_exchange_weak(expected, desired, order, scope)); + return expected; + } + + T operator-=(T operand) const noexcept { + return fetch_sub(operand) - operand; + } + + T fetch_min(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { + auto load_order = detail::getLoadOrder(order); + T old = load(load_order, scope); + while (operand < old && + !compare_exchange_weak(old, operand, order, scope)) { + } + return old; + } + + T fetch_max(T operand, memory_order order = default_read_modify_write_order, + memory_scope scope = default_scope) const noexcept { + auto load_order = detail::getLoadOrder(order); + T old = load(load_order, scope); + while (operand > old && + !compare_exchange_weak(old, operand, order, scope)) { + } + return old; + } + +private: + using atomic_ref_base::ptr; +}; + +// Partial specialization for pointer types +template +class atomic_ref_impl::value>> + : public atomic_ref_base { + // TODO: Implement partial specialization for pointer types +}; + +} // namespace detail + +template +class atomic_ref : public detail::atomic_ref_impl { +public: + using detail::atomic_ref_impl::atomic_ref_impl; +}; + +} // namespace intel +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/test/atomic_ref/add.cpp b/sycl/test/atomic_ref/add.cpp new file mode 100644 index 0000000000000..b152166e4f966 --- /dev/null +++ b/sycl/test/atomic_ref/add.cpp @@ -0,0 +1,189 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +void add_fetch_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(sum[0]); + out[gid] = atm.fetch_add(T(1)); + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == N); + + // Fetch returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == N - 1); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_plus_equal_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(sum[0]); + out[gid] = atm += T(1); + }); + }); + } + + // All work-items 
increment by 1, so final value should be equal to N + assert(sum == N); + + // += returns updated value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 1 && *max_e == N); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_pre_inc_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(sum[0]); + out[gid] = ++atm; + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == N); + + // Pre-increment returns updated value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 1 && *max_e == N); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_post_inc_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(sum[0]); + out[gid] = atm++; + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == N); + + // Post-increment returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == N - 1); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_test(queue q, size_t N) { + add_fetch_test(q, N); + add_plus_equal_test(q, N); + add_pre_inc_test(q, N); + add_post_inc_test(q, N); +} + +// Floating-point types do not support pre- or post-increment +template <> +void add_test(queue q, size_t N) { + add_fetch_test(q, N); + add_plus_equal_test(q, N); +} +template <> +void add_test(queue q, size_t N) { + add_fetch_test(q, N); + add_plus_equal_test(q, N); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + add_test(q, N); + add_test(q, N); + add_test(q, N); + add_test(q, N); + add_test(q, N); + add_test(q, N); + add_test(q, N); + add_test(q, N); + //add_test(q, N); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/sycl/test/atomic_ref/compare_exchange.cpp b/sycl/test/atomic_ref/compare_exchange.cpp new file mode 100644 index 0000000000000..8f563fccb65fd --- /dev/null +++ b/sycl/test/atomic_ref/compare_exchange.cpp @@ -0,0 +1,75 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class compare_exchange_kernel; + +template +void compare_exchange_test(queue q, size_t N) { + const T initial = std::numeric_limits::max(); + T compare_exchange = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer compare_exchange_buf(&compare_exchange, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto exc = compare_exchange_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for>(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(exc[0]); + T result = initial; + bool success = atm.compare_exchange_strong(result, (T)gid); + if (success) { + out[gid] = result; + } else { + out[gid] = gid; + } + }); + }); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be the index itself or the sentinel value + for (int i = 0; i < N; ++i) { + assert(output[i] == T(i) || output[i] == initial); + } +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + //compare_exchange_test(q, N); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/sycl/test/atomic_ref/exchange.cpp b/sycl/test/atomic_ref/exchange.cpp new file mode 100644 index 0000000000000..2ce1292cfdd55 --- /dev/null +++ b/sycl/test/atomic_ref/exchange.cpp @@ -0,0 +1,68 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class exchange_kernel; + +template +void exchange_test(queue q, size_t N) { + const T initial = std::numeric_limits::max(); + T exchange = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer exchange_buf(&exchange, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto exc = exchange_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for>(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(exc[0]); + out[gid] = atm.exchange(gid); + }); + }); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be unique; each work-item replaces the value it reads with its own ID + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + //exchange_test(q, N); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/sycl/test/atomic_ref/load.cpp b/sycl/test/atomic_ref/load.cpp new file mode 100644 index 0000000000000..274191b9a5ac3 --- /dev/null +++ b/sycl/test/atomic_ref/load.cpp @@ -0,0 +1,65 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class load_kernel; + +template +void load_test(queue q, size_t N) { + T initial = 42; + T load = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer load_buf(&load, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto ld = load_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for>(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(ld[0]); + out[gid] = atm.load(); + }); + }); + } + + // All work-items should read the same value + // Atomicity isn't tested here, but support for load() is + assert(std::all_of(output.begin(), output.end(), [&](T x) { return (x == initial); })); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + load_test(q, N); + load_test(q, N); + load_test(q, N); + load_test(q, N); + load_test(q, N); + load_test(q, N); + load_test(q, N); + load_test(q, N); + //load_test(q, N); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/atomic_ref/max.cpp b/sycl/test/atomic_ref/max.cpp new file mode 100644 index 0000000000000..38bb755fb5132 --- /dev/null +++ b/sycl/test/atomic_ref/max.cpp @@ -0,0 +1,74 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// XFAIL: cuda + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +void max_test(queue q, size_t N) { + T initial = std::numeric_limits::lowest(); + T val = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), std::numeric_limits::max()); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(val[0]); + + // +1 accounts for lowest() returning 0 for unsigned types + out[gid] = atm.fetch_max(T(gid) + 1); + }); + }); + } + + // Final value should be equal to N + assert(val == N); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_max returns original value + // Intermediate values should all be >= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] >= initial); + } +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + max_test(q, N); + max_test(q, N); + max_test(q, N); + 
max_test(q, N); + max_test(q, N); + max_test(q, N); + max_test(q, N); + max_test(q, N); + //max_test(q, N); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/atomic_ref/min.cpp b/sycl/test/atomic_ref/min.cpp new file mode 100644 index 0000000000000..385fc13552a0d --- /dev/null +++ b/sycl/test/atomic_ref/min.cpp @@ -0,0 +1,72 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// XFAIL: cuda + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +void min_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T val = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(val[0]); + out[gid] = atm.fetch_min(T(gid)); + }); + }); + } + + // Final value should be equal to 0 + assert(val == 0); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_min returns original value + // Intermediate values should all be <= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] <= initial); + } +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + min_test(q, N); + min_test(q, N); + min_test(q, N); + min_test(q, N); + min_test(q, N); + min_test(q, N); + min_test(q, N); + min_test(q, N); + //min_test(q, N); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/sycl/test/atomic_ref/store.cpp b/sycl/test/atomic_ref/store.cpp new file mode 100644 index 0000000000000..eebdba5ced095 --- /dev/null +++ b/sycl/test/atomic_ref/store.cpp @@ -0,0 +1,61 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class store_kernel; + +template +void store_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T store = initial; + { + buffer store_buf(&store, 1); + q.submit([&](handler &cgh) { + auto st = store_buf.template get_access(cgh); + cgh.parallel_for>(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(st[0]); + atm.store(T(gid)); + }); + }); + } + + // The initial value should have been overwritten by a work-item ID + // Atomicity isn't tested here, but support for store() is + assert(store != initial); + assert(store >= T(0) && store <= T(N - 1)); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + store_test(q, N); + store_test(q, N); + store_test(q, N); + store_test(q, N); + store_test(q, N); + store_test(q, N); + store_test(q, N); + store_test(q, N); + //store_test(q, N); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/atomic_ref/sub.cpp b/sycl/test/atomic_ref/sub.cpp new file mode 100644 index 0000000000000..52e338048e7be --- /dev/null +++ b/sycl/test/atomic_ref/sub.cpp @@ -0,0 +1,189 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +void sub_fetch_test(queue q, size_t N) { + T val = N; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(val[0]); + out[gid] = atm.fetch_sub(T(1)); + }); + }); + } + + // All work-items decrement by 1, so final value should be equal to 0 + assert(val == 0); + + // Fetch returns original value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 1 && *max_e == N); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void sub_plus_equal_test(queue q, size_t N) { + T val = N; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(val[0]); + 
out[gid] = atm -= T(1); + }); + }); + } + + // All work-items decrement by 1, so final value should be equal to 0 + assert(val == 0); + + // -= returns updated value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == N - 1); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void sub_pre_dec_test(queue q, size_t N) { + T val = N; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(val[0]); + out[gid] = --atm; + }); + }); + } + + // All work-items decrement by 1, so final value should be equal to 0 + assert(val == 0); + + // Pre-decrement returns updated value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == N - 1); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void sub_post_dec_test(queue q, size_t N) { + T val = N; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref(val[0]); + out[gid] = atm--; + }); + }); + } + + // All work-items decrement by 1, so final value should be equal to 0 + assert(val == 0); + + // Post-decrement returns original value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 1 && *max_e == N); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void sub_test(queue q, size_t N) { + sub_fetch_test(q, N); + sub_plus_equal_test(q, N); + sub_pre_dec_test(q, N); + sub_post_dec_test(q, N); +} + +// Floating-point types do not support pre- or post-decrement +template <> +void sub_test(queue q, size_t N) { + sub_fetch_test(q, N); + sub_plus_equal_test(q, N); +} +template <> +void sub_test(queue q, size_t N) { + sub_fetch_test(q, N); + sub_plus_equal_test(q, N); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + + // TODO: Enable missing tests when supported + sub_test(q, N); + sub_test(q, N); + sub_test(q, N); + sub_test(q, N); + sub_test(q, N); + sub_test(q, N); + sub_test(q, N); + sub_test(q, N); + //sub_test(q, N); + + std::cout << "Test passed." << std::endl; +}
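
For readers skimming the patch, the user-facing pattern exercised by the new tests looks roughly like the sketch below. This is a minimal illustration, not part of the patch: the kernel name `count_kernel` is invented, and the `intel::atomic_ref` template-parameter order (value type, default memory order, default memory scope, address space) is assumed from the `atomic_ref_base` declaration and the add.cpp test in this diff.

```c++
#include <CL/sycl.hpp>
#include <cassert>
using namespace sycl;

int main() {
  queue q;
  constexpr int N = 32;
  int sum = 0;
  {
    buffer<int> sum_buf(&sum, range<1>(1));
    q.submit([&](handler &cgh) {
      auto acc = sum_buf.get_access<access::mode::read_write>(cgh);
      cgh.parallel_for<class count_kernel>(range<1>(N), [=](item<1>) {
        // Wrap the shared accessor element in an atomic_ref and perform a
        // relaxed, device-scoped read-modify-write; every work-item adds 1.
        auto atm = intel::atomic_ref<int, intel::memory_order::relaxed,
                                     intel::memory_scope::device,
                                     access::address_space::global_space>(acc[0]);
        atm.fetch_add(1);
      });
    });
  }
  // Buffer destruction copies the result back; all N increments must land.
  assert(sum == N);
  return 0;
}
```

On device this maps to the `__spirv_Atomic*` wrappers added in spirv.hpp; on host it falls back to `std::atomic`. The patch also adds `intel::atomic_fence(order, scope)`, which plays the role of `std::atomic_thread_fence` with an explicit memory scope.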