From 71829b7742e9f454a4ee432f97b3c50982be2d8f Mon Sep 17 00:00:00 2001 From: Suryaprakash Shanmugam Date: Tue, 28 Oct 2025 22:37:37 -0700 Subject: [PATCH 1/4] Add CuTe Matrix Transpose tutorial --- examples/common/sycl_cute_common.hpp | 16 ++ examples/cute/tutorial/CMakeLists.txt | 5 + .../cute/tutorial/transpose/copy_direct.h | 141 +++++++++++++++ examples/cute/tutorial/transpose/copy_smem.h | 148 ++++++++++++++++ .../transpose/tiled_transpose_sycl.cpp | 29 ++++ .../cute/tutorial/transpose/transpose_naive.h | 115 ++++++++++++ .../cute/tutorial/transpose/transpose_smem.h | 163 ++++++++++++++++++ examples/cute/tutorial/transpose/util.h | 103 +++++++++++ include/cute/util/compat/traits.hpp | 2 +- include/cutlass/platform/platform.h | 2 + 10 files changed, 723 insertions(+), 1 deletion(-) create mode 100644 examples/cute/tutorial/transpose/copy_direct.h create mode 100644 examples/cute/tutorial/transpose/copy_smem.h create mode 100644 examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp create mode 100644 examples/cute/tutorial/transpose/transpose_naive.h create mode 100644 examples/cute/tutorial/transpose/transpose_smem.h create mode 100644 examples/cute/tutorial/transpose/util.h diff --git a/examples/common/sycl_cute_common.hpp b/examples/common/sycl_cute_common.hpp index 4875a1a57b..1018d043d6 100644 --- a/examples/common/sycl_cute_common.hpp +++ b/examples/common/sycl_cute_common.hpp @@ -94,6 +94,22 @@ zero_fill(InTensor &X) X(i) = T(0); } +template +void +random_fill(std::vector &X) { + + for (int i = 0; i < X.size(); i++) + X[i] = random_value(); +} + +template +void +zero_fill(std::vector &X) { + for (int i = 0; i < X.size(); i++) + X[i] = T(0); +} + + // Pack sub-byte types in a gmem tensor. // On input, the backing array holds one sub-byte value per byte. // On exit, the backing array contains packed values. diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt index 673e968e60..d868a21e1b 100644 --- a/examples/cute/tutorial/CMakeLists.txt +++ b/examples/cute/tutorial/CMakeLists.txt @@ -45,6 +45,11 @@ if (CUTLASS_ENABLE_SYCL) tiled_copy_sycl.cpp ) + cutlass_example_add_executable( + cute_tutorial_tiled_transpose + transpose/tiled_transpose_sycl.cpp + ) + cutlass_example_add_executable( cute_tutorial_tiled_copy_if tiled_copy_if_sycl.cpp diff --git a/examples/cute/tutorial/transpose/copy_direct.h b/examples/cute/tutorial/transpose/copy_direct.h new file mode 100644 index 0000000000..10861b360e --- /dev/null +++ b/examples/cute/tutorial/transpose/copy_direct.h @@ -0,0 +1,141 @@ +#pragma once + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// copy kernel adapted from +// https://github.com/NVIDIA/cutlass/blob/main/examples/cute/tutorial/tiled_copy.cu + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +#include + +template +void copy_kernel(TensorS S, TensorD D, ThreadLayout) { + using namespace cute; + + // Slice the tiled tensors + Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + + // Construct a partitioning of the tile among threads with the given thread + // arrangement. + + // Concept: Tensor ThrLayout ThrIndex + Tensor thr_tile_S = local_partition( + tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) + Tensor thr_tile_D = local_partition( + tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) + // + + // Construct a register-backed Tensor with the same shape as each thread's + // partition Use make_tensor to try to match the layout of thr_tile_S + Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN) + + // Copy from GMEM to RMEM and from RMEM to GMEM + copy(thr_tile_S, fragment); + copy(fragment, thr_tile_D); +} + +template void copy_direct(TransposeParams params) { + // + // Given a 2D shape, perform an efficient copy + // + + using namespace cute; + + // + // Make tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // + // Tile tensors + // + + // Define a statically sized block (M, N). + // Note, by convention, capital letters are used to represent static modes. + auto block_shape = make_shape(Int<1>{}, Int<16384>{}); + + if ((size<0>(tensor_shape) % size<0>(block_shape)) || + (size<1>(tensor_shape) % size<1>(block_shape))) { + std::cerr << "The tensor shape must be divisible by the block shape." + << std::endl; + } + // Equivalent check to the above + if (not evenly_divides(tensor_shape, block_shape)) { + std::cerr << "Expected the block_shape to evenly divide the tensor shape." 
+ << std::endl; + } + + // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile + // shape, and modes (m', n') correspond to the number of tiles. + // + // These will be used to determine the CUDA kernel grid dimensions. + Tensor tiled_tensor_S = + tiled_divide(tensor_S, block_shape); // ((M, N), m', n') + Tensor tiled_tensor_D = + tiled_divide(tensor_D, block_shape); // ((M, N), m', n') + + // Thread arrangement + Layout thr_layout = + make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{}); + + // + // Determine grid and block dimensions + // + + auto gridDim = compat::dim3( + size<1>(tiled_tensor_S), + size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' + auto blockDim = compat::dim3(size(thr_layout)); + + // + // Launch the kernel + // + compat::launch>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout); +} diff --git a/examples/cute/tutorial/transpose/copy_smem.h b/examples/cute/tutorial/transpose/copy_smem.h new file mode 100644 index 0000000000..18f3fef90d --- /dev/null +++ b/examples/cute/tutorial/transpose/copy_smem.h @@ -0,0 +1,148 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +#include "cutlass/detail/layout.hpp" + +// Shared Storage for aligned addresses +template struct SharedStorageCopy { + cute::array_aligned> smem; +}; + +template +void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout, + SmemLayout) { + using namespace cute; + using Element = typename TensorS::value_type; + + // Use Shared Storage structure to allocate aligned SMEM addresses. 
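+  // A rough sizing sketch (assuming the bM = 1, bN = 8192 float tile used
+  // below): the work-group stages 8192 * 4 B = 32 KiB in shared local
+  // memory, which must fit within the device's reported local_mem_size.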
+  using SharedStorage = SharedStorageCopy<Element, SmemLayout>;
+  auto smem = compat::local_mem<SharedStorage[1]>();
+  SharedStorage &shared_storage = *reinterpret_cast<SharedStorage *>(smem);
+
+  Tensor gS = S(make_coord(_, _), compat::work_group_id::x(),
+                compat::work_group_id::y()); // (bM, bN)
+  Tensor gD = D(make_coord(_, _), compat::work_group_id::x(),
+                compat::work_group_id::y()); // (bN, bM)
+
+  Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()),
+                          SmemLayout{}); // (bN, bM)
+
+  auto tiled_copy_load = make_tiled_copy(
+      Copy_Atom<UniversalCopy<Element>, Element>{},
+      ThreadLayout{});
+
+  auto tiled_copy_store = make_tiled_copy(
+      Copy_Atom<UniversalCopy<Element>, Element>{},
+      ThreadLayout{});
+  //
+  // Construct a Tensor corresponding to each thread's slice.
+  auto thr_copy_load = tiled_copy_load.get_thread_slice(compat::local_id::x());
+  auto thr_copy_store =
+      tiled_copy_store.get_thread_slice(compat::local_id::x());
+
+  Tensor tSgS = thr_copy_load.partition_S(gS);
+  Tensor tSsS = thr_copy_load.partition_D(sS);
+  //
+  Tensor tDsS = thr_copy_store.partition_D(sS);
+  Tensor tDgD = thr_copy_store.partition_D(gD);
+
+  copy(tiled_copy_load, tSgS, tSsS);
+
+  cp_async_fence();
+  cp_async_wait<0>();
+  syncthreads();
+  //
+  copy(tiled_copy_store, tDsS, tDgD);
+}
+
+template <typename Element> void copy_smem(TransposeParams<Element> params) {
+
+  using namespace cute;
+
+  //
+  // Make tensors
+  //
+  auto tensor_shape = make_shape(params.M, params.N);
+  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
+  auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
+  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
+  Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
+
+  //
+  // Tile tensors
+  using bM = Int<1>;
+  using bN = Int<8192>;
+
+  auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
+
+  auto smem_layout = make_layout(block_shape, LayoutRight{});
+
+  Tensor tiled_tensor_S =
+      tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')
+  Tensor tiled_tensor_D =
+      tiled_divide(tensor_D, block_shape); // ((bN, bM), n', m')
+
+  auto threadLayout =
+      make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});
+
+  //
+  // Determine grid and block dimensions
+  //
+
+  dim3 gridDim(
+      size<1>(tiled_tensor_S),
+      size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
+  dim3 blockDim(size(threadLayout)); // 1024 work-items
+
+  constexpr int smem_size =
+      int(sizeof(SharedStorageCopy<Element, decltype(smem_layout)>));
+
+  //
+  // Launch the kernel
+  //
+  compat::launch<
+      copySmemKernel<decltype(tiled_tensor_S), decltype(tiled_tensor_D),
+                     decltype(threadLayout), decltype(smem_layout)>>(
+      gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, threadLayout,
+      smem_layout);
+}
diff --git a/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp b/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp
new file mode 100644
index 0000000000..c179fa6485
--- /dev/null
+++ b/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp
@@ -0,0 +1,29 @@
+#include "copy_direct.h"
+#include "copy_smem.h"
+#include "transpose_naive.h"
+#include "transpose_smem.h"
+#include "util.h"
+
+int main(int argc, char const **argv) {
+
+  using Element = float;
+
+  int size = 16384;
+  int M = size, N = size, iterations = 10;
+
+  std::cout << "Matrix size: " << M << " x " << N << std::endl;
+
+  printf("Baseline copy.\n");
+  benchmark<Element, false>(copy_direct<Element>, M, N, iterations);
+
+  printf("\nNaive transpose (no smem):\n");
+  benchmark<Element, true>(transpose_naive<Element>, M, N, iterations);
+
+  printf("\nCopy through SMEM.\n");
+  benchmark<Element, false>(copy_smem<Element>, M, N, iterations);
+
+  printf("\nTranspose through SMEM:\n");
+  benchmark<Element, true>(transpose_smem<Element>, M, N, iterations);
+
+  return 0;
+}
diff --git
a/examples/cute/tutorial/transpose/transpose_naive.h b/examples/cute/tutorial/transpose/transpose_naive.h new file mode 100644 index 0000000000..d73203a1af --- /dev/null +++ b/examples/cute/tutorial/transpose/transpose_naive.h @@ -0,0 +1,115 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +template +void transposeKernelNaive(TensorS const S, TensorD const DT, + ThreadLayoutS const tS, ThreadLayoutD const tD) { + using namespace cute; + using Element = typename TensorS::value_type; + + Tensor gS = S(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (bM, bN) + Tensor gDT = DT(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (bM, bN) + + Tensor tSgS = local_partition(gS, ThreadLayoutS{}, + compat::local_id::x()); // (ThrValM, ThrValN) + Tensor tDgDT = local_partition(gDT, ThreadLayoutD{}, compat::local_id::x()); + + Tensor rmem = make_tensor_like(tSgS); + + copy(tSgS, rmem); + copy(rmem, tDgDT); +} + +template +void transpose_naive(TransposeParams params) { + + using namespace cute; + // + // Make Tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto tensor_shape_trans = make_shape(params.N, params.M); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // Make a transposed view of the output + auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{}); + Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT); + + // + // Tile tensors + // + + using bM = Int<8>; + using bN = Int<512>; + + auto block_shape = make_shape(bM{}, bN{}); // (bM, bN) + auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM) + + Tensor tiled_tensor_S = + tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n') + Tensor tiled_tensor_DT = + tiled_divide(tensor_DT, block_shape); // ((bM, bN), m', n') + + auto threadLayoutS = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + auto threadLayoutD = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + + auto gridDim = compat::dim3( + size<1>(tiled_tensor_S), + size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' + auto blockDim = compat::dim3(size(threadLayoutS)); + + // + // Launch the kernel + // + compat::launch< + transposeKernelNaive>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_DT, threadLayoutS, + threadLayoutD); +}; diff --git a/examples/cute/tutorial/transpose/transpose_smem.h b/examples/cute/tutorial/transpose/transpose_smem.h new file mode 100644 index 0000000000..51aa1bddba --- /dev/null +++ b/examples/cute/tutorial/transpose/transpose_smem.h @@ -0,0 +1,163 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +#include "cutlass/detail/layout.hpp" + +// Shared Storage for aligned addresses +template struct SharedStorageTranspose { + cute::array_aligned, + cutlass::detail::alignment_for_swizzle(SmemLayout{})> + smem; +}; + +template +void transposeSmemKernel(TensorS const S, TensorD const D, + SmemLayoutS const smemLayoutS, ThreadLayoutS const tS, + SmemLayoutD const smemLayoutD, + ThreadLayoutD const tD) { + using namespace cute; + using Element = typename TensorS::value_type; + + // Use Shared Storage structure to allocate aligned SMEM addresses. 
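+  // A rough sizing sketch (assuming the bM = 64, bN = 128 float tile used
+  // below): the staging buffer is 64 * 128 * 4 B = 32 KiB, and
+  // cutlass::detail::alignment_for_swizzle pads the alignment so the
+  // swizzled SmemLayout variant remains valid.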
+ using SharedStorage = SharedStorageTranspose; + auto smem = compat::local_mem(); + SharedStorage &shared_storage = *reinterpret_cast(smem); + + // two different views of smem + Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()), + smemLayoutS); // (bM, bN) + Tensor sD = make_tensor(make_smem_ptr(shared_storage.smem.data()), + smemLayoutD); // (bN, bM) + + Tensor gS = S(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (bM, bN) + Tensor gD = D(make_coord(_, _), compat::work_group_id::y(), + compat::work_group_id::x()); // (bN, bM) + + Tensor tSgS = + local_partition(gS, tS, compat::local_id::x()); // (ThrValM, ThrValN) + Tensor tSsS = + local_partition(sS, tS, compat::local_id::x()); // (ThrValM, ThrValN) + Tensor tDgD = local_partition(gD, tD, compat::local_id::x()); + Tensor tDsD = local_partition(sD, tD, compat::local_id::x()); + + cute::copy(tSgS, tSsS); // LDGSTS + + cp_async_fence(); + cp_async_wait<0>(); + syncthreads(); + + cute::copy(tDsD, tDgD); +} + +template +void transpose_smem(TransposeParams params) { + + using namespace cute; + + // + // Make tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto tensor_shape_trans = make_shape(params.N, params.M); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // + // Tile tensors + // + + using bM = Int<64>; + using bN = Int<128>; + + auto block_shape = make_shape(bM{}, bN{}); // (bM, bN) + auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM) + + Tensor tiled_tensor_S = + tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n') + Tensor tiled_tensor_D = + tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m') + + auto tileShapeS = make_layout(block_shape, LayoutRight{}); + auto tileShapeD = make_layout(block_shape_trans, LayoutRight{}); + + auto smemLayoutS = tileShapeS; + auto smemLayoutD = composition(smemLayoutS, tileShapeD); + auto smemLayoutS_swizzle = composition(Swizzle<5, 0, 5>{}, tileShapeS); + auto smemLayoutD_swizzle = composition(smemLayoutS_swizzle, tileShapeD); + + auto threadLayoutS = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + auto threadLayoutD = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + + constexpr int smem_size = + int(sizeof(SharedStorageTranspose)); + + // + // Determine grid and block dimensions + // + + dim3 gridDim( + size<1>(tiled_tensor_S), + size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' + dim3 blockDim(size(threadLayoutS)); // 256 threads + + if constexpr (isSwizzled) { + compat::launch>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, smemLayoutS_swizzle, + threadLayoutS, smemLayoutD_swizzle, threadLayoutD); + } else { + compat::launch>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, smemLayoutS, + threadLayoutS, smemLayoutD, threadLayoutD); + } +} diff --git a/examples/cute/tutorial/transpose/util.h b/examples/cute/tutorial/transpose/util.h new file mode 100644 index 0000000000..984fd12689 --- /dev/null +++ b/examples/cute/tutorial/transpose/util.h @@ -0,0 +1,103 @@ +#pragma once + +#include "../../../common/sycl_cute_common.hpp" +#include + +template struct TransposeParams { + T *__restrict__ input; + T *__restrict__ output; + + const int M; + const int N; + + TransposeParams(T *__restrict__ input_, T 
*__restrict__ output_, int M_, + int N_) + : input(input_), output(output_), M(M_), N(N_) {} +}; + +template +int benchmark(void (*transpose)(TransposeParams params), int M, int N, + int iterations = 10, bool verify = true) { + using namespace cute; + + auto tensor_shape_S = make_shape(M, N); + auto tensor_shape_D = (isTranspose) ? make_shape(N, M) : make_shape(M, N); + + // + // Allocate and initialize + // + std::vector h_S(size(tensor_shape_S)); + std::vector h_D(size(tensor_shape_D)); + + auto d_S = compat::malloc(size(tensor_shape_S)); + auto d_D = compat::malloc(size(tensor_shape_D)); + + if (not is_random) { + for (size_t i = 0; i < h_S.size(); ++i) { + h_S[i] = static_cast(i); + } + } else { + random_fill(h_S); + } + + compat::memcpy(d_S, h_S.data(), size(tensor_shape_S)); + + TransposeParams params(d_S, d_D, M, N); + + for (int i = 0; i < iterations; i++) { + auto t1 = std::chrono::high_resolution_clock::now(); + transpose(params); + compat::wait_and_throw(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration tDiff = t2 - t1; + double time_ms = tDiff.count(); + double M_ = double(M); + double N_ = double(N); + double bytes = 2 * M_ * N_ * sizeof(T); + + std::cout << "Trial " << i << " Completed in " << time_ms << "ms (" + << std::fixed << std::setprecision(2) << 1e-6 * bytes / time_ms + << " GB/s)" << std::endl; + } + + if (verify) { + compat::memcpy(h_D.data(), d_D, size(tensor_shape_D)); + + int bad = 0; + if constexpr (isTranspose) { + auto transpose_function = make_layout(tensor_shape_S, LayoutRight{}); + for (size_t i = 0; i < h_D.size(); ++i) + if (h_D[i] != h_S[transpose_function(i)]) + bad++; + } else { + for (size_t i = 0; i < h_D.size(); ++i) + if (h_D[i] != h_S[i]) + bad++; + } +#if 0 + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + std::cout << (int)h_S[i * N + j] << "\t"; + } + std::cout << std::endl; + } + std::cout << std::endl; + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + std::cout << (int)h_D[i * N + j] << "\t"; + } + std::cout << std::endl; + } + +#endif + + if (bad > 0) { + std::cout << "Validation failed. Correct values: " << h_D.size() - bad + << ". Incorrect values: " << bad << std::endl; + } else { + std::cout << "Validation success." 
<< std::endl; + } + } + return 0; +} diff --git a/include/cute/util/compat/traits.hpp b/include/cute/util/compat/traits.hpp index fcb3f3bc43..cd94efbd03 100644 --- a/include/cute/util/compat/traits.hpp +++ b/include/cute/util/compat/traits.hpp @@ -89,7 +89,7 @@ template struct range_to_item_map> { using ItemT = sycl::nd_item; }; template struct range_to_item_map> { - using ItemT = sycl::item; + using ItemT = sycl::item; }; template diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h index 7e3816394e..1b3b6227cd 100644 --- a/include/cutlass/platform/platform.h +++ b/include/cutlass/platform/platform.h @@ -866,6 +866,8 @@ struct numeric_limits { CUTLASS_HOST_DEVICE static constexpr float infinity() noexcept { return bit_cast(0x7f800000);} CUTLASS_HOST_DEVICE + static constexpr float lowest() noexcept { return -bit_cast(0x7f7fffff) - 1;} + CUTLASS_HOST_DEVICE static constexpr float max() noexcept { return bit_cast(0x7f7fffff);} static constexpr bool is_integer = false; static constexpr bool has_infinity = true; From 1f53721adeba8f60b63a76589716d345d8d59ca4 Mon Sep 17 00:00:00 2001 From: Suryaprakash Shanmugam Date: Tue, 28 Oct 2025 22:37:42 -0700 Subject: [PATCH 2/4] Add the native sycl kernel code --- .../{tiled_transpose_sycl.cpp => main.cpp} | 0 .../tutorial/transpose/transpose_sycl.cpp | 544 ++++++++++++++++++ 2 files changed, 544 insertions(+) rename examples/cute/tutorial/transpose/{tiled_transpose_sycl.cpp => main.cpp} (100%) create mode 100644 examples/cute/tutorial/transpose/transpose_sycl.cpp diff --git a/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp b/examples/cute/tutorial/transpose/main.cpp similarity index 100% rename from examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp rename to examples/cute/tutorial/transpose/main.cpp diff --git a/examples/cute/tutorial/transpose/transpose_sycl.cpp b/examples/cute/tutorial/transpose/transpose_sycl.cpp new file mode 100644 index 0000000000..6e4a31d1b4 --- /dev/null +++ b/examples/cute/tutorial/transpose/transpose_sycl.cpp @@ -0,0 +1,544 @@ +/* + * Objective: Transpose a square matrix tile of size 32 on a side + * + * */ + +/* + * Work Group Configuration: Each work group handles one tile. + + Dimensions: (TILE_DIM x BLOCK_ROWS) = (32 x 8) work-items. + This means each work group contains 32 * 8 = 256 work-items. + For a 32x32 tile (1024 elements), each work-item processes 1024 / 256 = 4 + elements. + + Example Work Group for Tile (0,0): + Thread indices within the work group (local_id): + localID[0] + ^ + | + 7 | t(0,7) t(1,7) t(2,7) ... t(31,7) + 6 | t(0,6) t(1,6) t(2,6) ... t(31,6) + 5 | t(0,5) t(1,5) t(2,5) ... t(31,5) + 4 | t(0,4) t(1,4) t(2,4) ... t(31,4) + 3 | t(0,3) t(1,3) t(2,3) ... t(31,3) + 2 | t(0,2) t(1,2) t(2,2) ... t(31,2) + 1 | t(0,1) t(1,1) t(2,1) ... t(31,1) + 0 | t(0,0) t(1,0) t(2,0) ... t(31,0) --> localId[1]: 0 1 2 ... 
31
+        +------------------------------------->
+ */
+
+#include <sycl/sycl.hpp>
+
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "benchmark.h"
+
+// size of the entire square N x N matrix
+// we still use separate variables for the sides so we can
+// think about tile and block indexing in the matrix rows/cols
+constexpr size_t N = 16384;
+constexpr size_t Nr = N;
+constexpr size_t Nc = N;
+
+// size of a single data tile that we will work with
+// we use 32 here (matching the 32x32 tile described above) so that
+// column accesses stride through the same banks and demonstrate bank
+// conflicts on intel gpus
+constexpr size_t TILE_DIM = 32;
+
+// number of rows in our workgroup
+// intentionally this is a smaller number because we want to use
+// a single work-item to copy 4 elements
+constexpr size_t BLOCK_ROWS = TILE_DIM / 4;
+
+constexpr size_t numIters = 100;
+
+typedef unsigned int uint;
+using T = float;
+
+template <typename AccT> auto get_accessor_pointer(const AccT &acc) {
+  return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
+}
+
+int main() {
+  std::vector<T> A(Nr * Nc);
+  std::vector<T> A_T(Nr * Nc);
+  std::vector<T> A_T_ref(Nr * Nc);
+
+  if (Nr % TILE_DIM or Nc % TILE_DIM) {
+    throw std::runtime_error("Nr and Nc must be multiples of TILE_DIM");
+  }
+
+  if (TILE_DIM % BLOCK_ROWS) {
+    throw std::runtime_error("TILE_DIM must be a multiple of BLOCK_ROWS");
+  }
+
+  // fill the matrix and prep ref output on the host
+  for (int i = 0; i < Nr; i++)
+    for (int j = 0; j < Nc; j++)
+      A[i * Nr + j] = i * Nr + j; // data same as the linear physical index
+
+  // for the ref transpose out, flip the fastest-varying index on the reads
+  for (int i = 0; i < Nr; i++)
+    for (int j = 0; j < Nc; j++)
+      A_T_ref[i * Nr + j] = j * Nr + i;
+
+  try {
+    auto q = sycl::queue{sycl::property::queue::enable_profiling{}};
+
+    std::cout << "Running on "
+              << q.get_device().get_info<sycl::info::device::name>() << "\n";
+    std::cout << "Local Memory Size: "
+              << q.get_device().get_info<sycl::info::device::local_mem_size>() /
+                     1024
+              << "KB" << std::endl;
+    std::cout
+        << "Max Work Group Size: "
+        << q.get_device().get_info<sycl::info::device::max_work_group_size>()
+        << std::endl;
+
+    sycl::range<2> dataRange{Nr, Nc};
+    // divide the row dim by 4 as we use a single work-item to move 4 values
+    sycl::range<2> globalRange{Nr / 4, Nc};
+    sycl::range<2> localRange{BLOCK_ROWS, TILE_DIM};
+    sycl::nd_range<2> ndRange{globalRange, localRange};
+
+    {
+      sycl::buffer<T, 2> h_idata{A.data(), dataRange};
+      sycl::buffer<T, 2> h_odata{A_T.data(), dataRange};
+
+      // Simple copy without coalescing to demonstrate its inefficiency
+      auto simple_copy = [&]() {
+        q.submit([&](sycl::handler &cgh) {
+          sycl::accessor d_idata{h_idata, cgh, sycl::read_only};
+          sycl::accessor d_odata{h_odata, cgh, sycl::write_only,
+                                 sycl::property::no_init{}};
+
+          cgh.parallel_for(
+              ndRange, [=](sycl::nd_item<2> item) {
+                auto globalId = item.get_global_id();
+                auto localId = item.get_local_id();
+
+                auto row_id = item.get_group(0) * TILE_DIM + localId[0];
+                auto col_id = globalId[1];
+
+                for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+                  // the swapped indices make consecutive work-items access
+                  // locations a full row apart, so nothing is coalesced
+                  sycl::id<2> dataIdx{col_id, row_id + i};
+                  d_odata[dataIdx] = d_idata[dataIdx];
+                }
+              });
+        });
+        q.wait_and_throw();
+      };
+
+      // Simple copy with coalescing used as reference for best effective
+      // bandwidth
+      auto simple_coalesced_copy = [&]() {
+        q.submit([&](sycl::handler &cgh) {
+          sycl::accessor d_idata{h_idata, cgh, sycl::read_only};
+          sycl::accessor d_odata{h_odata, cgh, sycl::write_only,
+                                 sycl::property::no_init{}};
+
+          cgh.parallel_for(
+              ndRange, [=](sycl::nd_item<2> item) {
+                auto globalId = item.get_global_id();
+                auto localId = item.get_local_id();
+
+                // get_group(0) gives the work-group id along the row dim,
+                // which we scale by TILE_DIM to get the group offset.
+                // Just using the global
id wouldn't work here + // because we don't have a 1:1 thread:value + // mapping here) + auto row_id = item.get_group(0) * TILE_DIM + localId[0]; + // work-items of the fastest varying dimension (1) access + // consecutive memory locations so that loads and stores + // are coalesced + auto col_id = globalId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + d_odata[dataIdx] = d_idata[dataIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // Naive Transpose + // reads are coalesced, but writes are not + auto naive_transpose = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto row_id = item.get_group(0) * TILE_DIM + localId[0]; + auto col_id = globalId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id idataIdx{row_id + i, col_id}; + // swap the output buffer's indices to transpose it + sycl::id odataIdx{col_id, row_id + i}; + d_odata[odataIdx] = d_idata[idataIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // Tiled copy through SMEM as a baseline for SMEM transpose + auto smem_copy = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + sycl::range tileRange{TILE_DIM, TILE_DIM}; + sycl::local_accessor sharedMemTile{tileRange, cgh}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sharedMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // We need to wait here to ensure that all work items + // have written to local memory before we start reading + // from it. 
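+                // (The barrier must be work-group scope: the tile in SLM
+                // is produced and consumed by all work-items of the
+                // group, so a sub-group barrier would not be sufficient.)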
+ sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced write to gmem from smem + d_odata[dataIdx] = sharedMemTile[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // Coalesce reads and writes to global memory but do the strided + // access required for the transpose in shared local memory as it + // doesn't levy as much a much penalty when done in SLM compared to + // GMEM + auto smem_transpose = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + sycl::range tileRange{TILE_DIM, TILE_DIM}; + sycl::local_accessor sharedMemTile{tileRange, cgh}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + auto row_id_T = groupOffset_1 + localId[0]; + auto col_id_T = groupOffset_0 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sharedMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // We need to wait here to ensure that all work items + // have written to local memory before we start reading + // from it. + sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id_T + i, col_id_T}; + // this creates strided reads in smem, but the writes + // to gmem are still coalesced + sycl::id smemTileIdx{localId[1], localId[0] + i}; + d_odata[dataIdx] = sharedMemTile[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // SMEM Transpose avoiding bank conflict by allocating TILE_DIM + 1 on + // SMEM column dimension, causing every element in the smem to fall in + // a different shared memory bank; kernel is the same as above + auto smem_transpose_no_bank_conflict = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + sycl::range tileRange{TILE_DIM, TILE_DIM + 1}; + sycl::local_accessor sharedMemTile{tileRange, cgh}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sharedMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // We need to wait here to ensure that all work items + // have + // written to local memory before we start reading from + // it + sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + // output tile offsets need to be swapped + sycl::id dataIdx{groupOffset_1 + localId[0] + i, + groupOffset_0 + localId[1]}; + // this creates strided reads in smem, but the writes 
+ // to + // gmem are still coalesced + sycl::id smemTileIdx{localId[1], localId[0] + i}; + d_odata[dataIdx] = sharedMemTile[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // // transpose using subgroup shuffle functions + // util::benchmark( + // [&]() { + // q.submit([&](sycl::handler &cgh) { + // sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + // sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + // sycl::property::no_init{}}; + // + // // this kernel requires the tile size to be equal to the + // // sub-group size used so we can use the sub-group shuffle + // // functions + // constexpr size_t BLOCK_SIZE = 16; + // cgh.parallel_for( + // sycl::nd_range<2>(sycl::range<2>(Nr / BLOCK_SIZE, Nc), + // sycl::range<2>(1, BLOCK_SIZE)), + // [=](sycl::nd_item<2> it) + // [[sycl::reqd_sub_group_size(16)]] + // { + // auto localId = it.get_local_id(); + // int gi = it.get_group(0); + // int gj = it.get_group(1); + // + // auto sg = it.get_sub_group(); + // uint sgId = sg.get_local_id()[0]; + // + // float bcol[BLOCK_SIZE]; + // int ai = BLOCK_SIZE * gi; + // int aj = BLOCK_SIZE * gj; + // + // for (uint k = 0; k < BLOCK_SIZE; k++) { + // // load columns of the matrix tile into the + // subgroup bcol[k] = + // sg.load(get_accessor_pointer(d_idata) + + // (ai + k) * Nc + aj); + // } + // + // // no barriers required here because the threads of a + // // sub-group execute concurrently, so all columns of + // the + // // matrix were loaded into bcol already + // + // float tcol[BLOCK_SIZE]; + // for (uint n = 0; n < BLOCK_SIZE; n++) { + // if (sgId == n) { + // for (uint k = 0; k < BLOCK_SIZE; k++) { + // // returns the value of bcol[n] from the k-th + // // work-item + // tcol[k] = sycl::select_from_group(sg, bcol[n], + // k); + // } + // } + // } + // + // for (uint k = 0; k < BLOCK_SIZE; k++) { + // sg.store(get_accessor_pointer(d_odata) + (aj + k) * + // Nc + // + + // ai, + // tcol[k]); + // } + // }); + // }); + // q.wait_and_throw(); + // }, + // numIters, Nc * Nr, + // "Tiled GMEM Transpose with sub-group shuffle functions"); + + // Tiled Transpose using the sub-group shuffle function + // where loads and stores are to shared local memory + auto tiled_subgroup_shuffle = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + // sub-group size == data block size in smem to be transposed + constexpr size_t BLOCK_SIZE = 16; + sycl::range tileRange{TILE_DIM, TILE_DIM}; + sycl::local_accessor sMemTile{tileRange, cgh}; + sycl::local_accessor sMemTileTransposed{tileRange, cgh}; + + cgh.parallel_for( + ndRange, + [=](sycl::nd_item<2> item) [[sycl::reqd_sub_group_size(16)]] { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // Now sMem holds a 32x32 size data tile that we + // transpose with sub-group select function + // Each 1x16 sub-group of work-item can transpose one + // 16x16 tile in the SMEM. 
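+                // (With TILE_DIM = 32 and a sub-group size of 16, each
+                // pass below fixes ai to one 16-row band of the tile.)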
So we need to loop over the + // process two times to transpose an entire 32x32 tile + { + auto sg = item.get_sub_group(); + uint sgId = sg.get_local_id()[0]; + + float bcol[BLOCK_SIZE]; + int ai = BLOCK_SIZE * 0; + int aj = BLOCK_SIZE * sg.get_group_id(); + + for (uint k = 0; k < BLOCK_SIZE; k++) { + // load columns of the matrix + // tile into the subgroup + bcol[k] = sg.load(get_accessor_pointer(sMemTile) + + (ai + k) * TILE_DIM + aj); + } + + float tcol[BLOCK_SIZE]; + for (uint n = 0; n < BLOCK_SIZE; n++) { + if (sgId == n) { + for (uint k = 0; k < BLOCK_SIZE; k++) { + // returns the value of bcol[n] from the k-th + // work-item + tcol[k] = sycl::select_from_group(sg, bcol[n], k); + } + } + } + + for (uint k = 0; k < BLOCK_SIZE; k++) { + sg.store(get_accessor_pointer(sMemTileTransposed) + + (aj + k) * TILE_DIM + ai, + tcol[k]); + } + } + + { + auto sg = item.get_sub_group(); + uint sgId = sg.get_local_id()[0]; + + float bcol[BLOCK_SIZE]; + int ai = BLOCK_SIZE * 1; + int aj = BLOCK_SIZE * sg.get_group_id(); + + for (uint k = 0; k < BLOCK_SIZE; k++) { + // load columns of the matrix + // tile into the subgroup + bcol[k] = sg.load(get_accessor_pointer(sMemTile) + + (ai + k) * TILE_DIM + aj); + } + + float tcol[BLOCK_SIZE]; + for (uint n = 0; n < BLOCK_SIZE; n++) { + if (sgId == n) { + for (uint k = 0; k < BLOCK_SIZE; k++) { + // returns the value of bcol[n] from the k-th + // work-item + tcol[k] = sycl::select_from_group(sg, bcol[n], k); + } + } + } + + for (uint k = 0; k < BLOCK_SIZE; k++) { + sg.store(get_accessor_pointer(sMemTileTransposed) + + (aj + k) * TILE_DIM + ai, + tcol[k]); + } + } + + // We need to wait here to ensure that all work items + // have written to local memory before we start reading + // from it. + sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{groupOffset_1 + localId[0] + i, + groupOffset_0 + localId[1]}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced write to gmem from smem + d_odata[dataIdx] = sMemTileTransposed[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + util::benchmark(simple_copy, numIters, Nc * Nr, + "Simple Non-Coalesced Tiled Copy"); + util::benchmark(simple_coalesced_copy, numIters, Nc * Nr, + "Simple Tiled Copy"); + util::benchmark(naive_transpose, numIters, Nc * Nr, "Naive Transpose"); + util::benchmark(smem_copy, numIters, Nc * Nr, "Tiled SMEM Copy"); + util::benchmark(smem_transpose, numIters, Nc * Nr, + "Tiled SMEM Transpose"); + util::benchmark(smem_transpose_no_bank_conflict, numIters, Nc * Nr, + "Tiled SMEM Transpose avoiding Bank Conflict"); + util::benchmark(tiled_subgroup_shuffle, numIters, Nc * Nr, + "Tiled SMEM Transpose with sub-group shuffle functions"); + } + } catch (const sycl::exception &e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } From 3578f8e5ef50da67a099feedfbac0830a408db94 Mon Sep 17 00:00:00 2001 From: Suryaprakash Shanmugam Date: Tue, 28 Oct 2025 22:38:58 -0700 Subject: [PATCH 3/4] Add transposed load kernel through the new Xe Copy Atoms --- examples/cute/tutorial/CMakeLists.txt | 2 +- .../transpose/block_2d_transposed_copy.h | 148 ++++++++++++++++ .../cute/tutorial/transpose/copy_direct.h | 158 ++++++++++-------- examples/cute/tutorial/transpose/copy_smem.h | 8 +- examples/cute/tutorial/transpose/main.cpp | 5 + .../cute/tutorial/transpose/transpose_naive.h | 4 - .../cute/tutorial/transpose/transpose_smem.h | 12 +- 7 files changed, 254 insertions(+), 83 deletions(-) create mode 100644 
examples/cute/tutorial/transpose/block_2d_transposed_copy.h diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt index d868a21e1b..cff90792ad 100644 --- a/examples/cute/tutorial/CMakeLists.txt +++ b/examples/cute/tutorial/CMakeLists.txt @@ -47,7 +47,7 @@ if (CUTLASS_ENABLE_SYCL) cutlass_example_add_executable( cute_tutorial_tiled_transpose - transpose/tiled_transpose_sycl.cpp + transpose/main.cpp ) cutlass_example_add_executable( diff --git a/examples/cute/tutorial/transpose/block_2d_transposed_copy.h b/examples/cute/tutorial/transpose/block_2d_transposed_copy.h new file mode 100644 index 0000000000..d061df9af0 --- /dev/null +++ b/examples/cute/tutorial/transpose/block_2d_transposed_copy.h @@ -0,0 +1,148 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#include +#include +#include +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +template +void block2DTransposedLoadKernel(TensorS const S, TensorD const DT, + BlockShape const block_shape, + BlockShapeTrans const block_shape_transposed, + ThreadLayout const thread_layout) { + using namespace cute; + using Element = typename TensorS::value_type; + + /* get workgroup and local ids */ + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>(); + auto wg_m = int(item.get_group(0)); + auto wg_n = int(item.get_group(1)); + auto local_id = int(item.get_local_id(0)); + + /* proxy coordinate tensor */ + Tensor cS = make_identity_tensor(S.shape()); // (M,N) + Tensor cDT = make_identity_tensor(DT.shape()); // (N,M) + + auto wg_coord = make_coord(wg_m, wg_n); + auto wg_coord_transposed = make_coord(wg_n, wg_m); + + // Tensor data = ... // ( M, N) Tensor cta_data = local_tile(data, + // Shape<16, 16>{}, make_coord(blockIdx.x,blockIdx.y)); // (_32,_64) + Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N) + Tensor gDT = local_tile(cDT, block_shape_transposed, + wg_coord_transposed); // (BLK_N,BLK_M); + + constexpr int CopyBits = sizeof_bits_v; + auto transposed_load_op = XE_LOAD_2D_TRANSPOSE{}; + auto store_op = XE_STORE_2D{}; + + /* Slice TiledCopy operations to thread (work-item) level */ + auto transpose_S = make_block_2d_copy(transposed_load_op, S); + auto thr_transpose_S = transpose_S.get_slice(local_id); + + auto store_DT = make_block_2d_copy(store_op, DT); + auto thr_copy_DT = store_DT.get_slice(local_id); + + /* Register fragments for transposed copy */ + auto tSrS = thr_transpose_S.partition_sg_fragment_D(gS); + auto tDrD = thr_copy_DT.partition_sg_fragment_D(gDT); + + /* Partition global tensor (proxies) for copies */ + Tensor tSgS = thr_transpose_S.partition_S(gS); + Tensor tDgD = thr_copy_DT.partition_D(gDT); + + // if ( cute::thread(0, 0)){ + // print(tSgS);print("\n"); + // print(tSrS);print("\n"); + // print(tDgD);print("\n"); + // } + + copy(transpose_S, tSgS, tSrS); + // copy(tSrS, tDrD); + copy(store_DT, tSrS, tDgD); +} + +class TransposeCuteName; +template +void block_2d_transposed_copy(TransposeParams params) { + + using namespace cute; + // + // Make Tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto tensor_shape_trans = make_shape(params.N, params.M); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // Make a transposed view of the output + // auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{}); + // Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT); + + sycl::queue Q; + + // + // Tile tensors + // + + using bM = Int<32>; + using bN = Int<8>; + + auto block_shape = make_shape(bM{}, bN{}); // (bM, bN) + auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM) + + sycl::range<2> local = {bM{}, 1}; + sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}), + local[1] * ceil_div(shape<1>(tensor_S), bN{})}; + + auto threadLayout = make_layout(make_shape(bM{}, Int<1>{}), LayoutRight{}); + + namespace syclex = sycl::ext::oneapi::experimental; + namespace intelex = sycl::ext::intel::experimental; + + syclex::properties 
kernel_props{syclex::sub_group_size<16>, + intelex::grf_size<256>}; + + auto event = Q.parallel_for( + sycl::nd_range<2>(global, local), kernel_props, [=](auto) { + block2DTransposedLoadKernel(tensor_S, tensor_DT, block_shape, + block_shape_trans, threadLayout); + }); +}; diff --git a/examples/cute/tutorial/transpose/copy_direct.h b/examples/cute/tutorial/transpose/copy_direct.h index 10861b360e..a4d255bfd4 100644 --- a/examples/cute/tutorial/transpose/copy_direct.h +++ b/examples/cute/tutorial/transpose/copy_direct.h @@ -51,6 +51,22 @@ void copy_kernel(TensorS S, TensorD D, ThreadLayout) { using namespace cute; // Slice the tiled tensors + // This line slices the tiled tensor S to get the tile for the current work + // group. S is a 3D tensor with layout ((M, N), m', n') where: + // - (M, N) is the block/tile shape (first mode) + // - m' is the number of tiles in the M dimension (second mode) + // - n' is the number of tiles in the N dimension (third mode) + // + // The indexing S(make_coord(_, _), x, y) selects: + // - make_coord(_, _): Takes all elements from the first mode (M, N), i.e., + // the entire tile + // - compat::work_group_id::x(): Selects the x-th tile along the m' + // dimension + // - compat::work_group_id::y(): Selects the y-th tile along the n' + // dimension + // + // Result: A 2D tensor of shape (BlockShape_M, BlockShape_N) corresponding to + // the tile assigned to the current work group. Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(), compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(), @@ -64,78 +80,78 @@ void copy_kernel(TensorS S, TensorD D, ThreadLayout) { tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) Tensor thr_tile_D = local_partition( tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) - // - - // Construct a register-backed Tensor with the same shape as each thread's - // partition Use make_tensor to try to match the layout of thr_tile_S - Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN) - - // Copy from GMEM to RMEM and from RMEM to GMEM - copy(thr_tile_S, fragment); - copy(fragment, thr_tile_D); -} - -template void copy_direct(TransposeParams params) { - // - // Given a 2D shape, perform an efficient copy - // - - using namespace cute; - - // - // Make tensors - // - auto tensor_shape = make_shape(params.M, params.N); - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); - Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); - Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); - - // - // Tile tensors - // - // Define a statically sized block (M, N). - // Note, by convention, capital letters are used to represent static modes. - auto block_shape = make_shape(Int<1>{}, Int<16384>{}); + // Construct a register-backed Tensor with the same shape as each thread's + // partition Use make_tensor to try to match the layout of thr_tile_S + Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN) - if ((size<0>(tensor_shape) % size<0>(block_shape)) || - (size<1>(tensor_shape) % size<1>(block_shape))) { - std::cerr << "The tensor shape must be divisible by the block shape." - << std::endl; - } - // Equivalent check to the above - if (not evenly_divides(tensor_shape, block_shape)) { - std::cerr << "Expected the block_shape to evenly divide the tensor shape." 
- << std::endl; + // Copy from GMEM to RMEM and from RMEM to GMEM + copy(thr_tile_S, fragment); + copy(fragment, thr_tile_D); } - // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile - // shape, and modes (m', n') correspond to the number of tiles. - // - // These will be used to determine the CUDA kernel grid dimensions. - Tensor tiled_tensor_S = - tiled_divide(tensor_S, block_shape); // ((M, N), m', n') - Tensor tiled_tensor_D = - tiled_divide(tensor_D, block_shape); // ((M, N), m', n') - - // Thread arrangement - Layout thr_layout = - make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{}); - - // - // Determine grid and block dimensions - // - - auto gridDim = compat::dim3( - size<1>(tiled_tensor_S), - size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' - auto blockDim = compat::dim3(size(thr_layout)); - - // - // Launch the kernel - // - compat::launch>( - gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout); -} + template + void copy_direct(TransposeParams params) { + // + // Given a 2D shape, perform an efficient copy + // + + using namespace cute; + + // + // Make tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // + // Tile tensors + // + + // Define a statically sized block (M, N). + // Note, by convention, capital letters are used to represent static modes. + auto block_shape = make_shape(Int<1>{}, Int<16384>{}); + + if ((size<0>(tensor_shape) % size<0>(block_shape)) || + (size<1>(tensor_shape) % size<1>(block_shape))) { + std::cerr << "The tensor shape must be divisible by the block shape." + << std::endl; + } + // Equivalent check to the above + if (not evenly_divides(tensor_shape, block_shape)) { + std::cerr << "Expected the block_shape to evenly divide the tensor shape." + << std::endl; + } + + // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static + // tile shape, and modes (m', n') correspond to the number of tiles. + // + // These will be used to determine the CUDA kernel grid dimensions. 
+  Tensor tiled_tensor_S =
+      tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
+  Tensor tiled_tensor_D =
+      tiled_divide(tensor_D, block_shape); // ((M, N), m', n')
+
+  // Thread arrangement
+  Layout thr_layout =
+      make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});
+
+  //
+  // Determine grid and block dimensions
+  //
+
+  auto gridDim = compat::dim3(
+      size<1>(tiled_tensor_S),
+      size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
+  auto blockDim = compat::dim3(size(thr_layout));
+
+  //
+  // Launch the kernel
+  //
+  compat::launch<copy_kernel<decltype(tiled_tensor_S), decltype(tiled_tensor_D),
+                             decltype(thr_layout)>>(
+      gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout);
+}
diff --git a/examples/cute/tutorial/transpose/copy_smem.h b/examples/cute/tutorial/transpose/copy_smem.h
index 18f3fef90d..05399f8b64 100644
--- a/examples/cute/tutorial/transpose/copy_smem.h
+++ b/examples/cute/tutorial/transpose/copy_smem.h
@@ -62,10 +62,10 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   Tensor gS = S(make_coord(_, _), compat::work_group_id::x(),
                 compat::work_group_id::y()); // (bM, bN)
   Tensor gD = D(make_coord(_, _), compat::work_group_id::x(),
-                compat::work_group_id::y()); // (bN, bM)
+                compat::work_group_id::y()); // (bM, bN)
 
   Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()),
-                          SmemLayout{}); // (bN, bM)
+                          SmemLayout{}); // (bM, bN)
 
   auto tiled_copy_load = make_tiled_copy(
       Copy_Atom, Element>{},
@@ -83,7 +83,7 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   Tensor tSgS = thr_copy_load.partition_S(gS);
   Tensor tSsS = thr_copy_load.partition_D(sS);
   //
-  Tensor tDsS = thr_copy_store.partition_D(sS);
+  Tensor tDsD = thr_copy_store.partition_S(sS);
   Tensor tDgD = thr_copy_store.partition_D(gD);
 
   copy(tiled_copy_load, tSgS, tSsS);
@@ -92,7 +92,7 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   cp_async_wait<0>();
   syncthreads();
   //
-  copy(tiled_copy_store, tDsS, tDgD);
+  copy(tiled_copy_store, tDsD, tDgD);
 }
 
 template <typename Element> void copy_smem(TransposeParams<Element> params) {
diff --git a/examples/cute/tutorial/transpose/main.cpp b/examples/cute/tutorial/transpose/main.cpp
index c179fa6485..b4c4e61275 100644
--- a/examples/cute/tutorial/transpose/main.cpp
+++ b/examples/cute/tutorial/transpose/main.cpp
@@ -1,3 +1,4 @@
+#include "block_2d_transposed_copy.h"
 #include "copy_direct.h"
 #include "copy_smem.h"
 #include "transpose_naive.h"
@@ -25,5 +26,9 @@ int main(int argc, char const **argv) {
   printf("\nTranspose through SMEM.:\n");
   benchmark<float>(transpose_smem<float>, M, N, iterations);
 
+  printf("\nBlock 2D transposed load:\n");
+  benchmark<float>(block_2d_transposed_copy<float>, M, N, iterations);
+
   return 0;
 }
diff --git a/examples/cute/tutorial/transpose/transpose_naive.h b/examples/cute/tutorial/transpose/transpose_naive.h
index d73203a1af..70aed2cdf3 100644
--- a/examples/cute/tutorial/transpose/transpose_naive.h
+++ b/examples/cute/tutorial/transpose/transpose_naive.h
@@ -69,11 +69,8 @@ void transpose_naive(TransposeParams<Element> params) {
   // Make Tensors
   //
   auto tensor_shape = make_shape(params.M, params.N);
-  auto tensor_shape_trans = make_shape(params.N, params.M);
   auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
-  auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
   Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
-  Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
 
   // Make a transposed view of the output
   auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{});
@@ -87,7 +84,6 @@ void transpose_naive(TransposeParams<Element> params) {
   using bN = Int<512>;
 
   auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
-  auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)
 
   Tensor tiled_tensor_S =
       tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')
diff --git a/examples/cute/tutorial/transpose/transpose_smem.h b/examples/cute/tutorial/transpose/transpose_smem.h
index 51aa1bddba..b251df97f2 100644
--- a/examples/cute/tutorial/transpose/transpose_smem.h
+++ b/examples/cute/tutorial/transpose/transpose_smem.h
@@ -44,9 +44,7 @@
 // Shared Storage for aligned addresses
 template <class Element, class SmemLayout> struct SharedStorageTranspose {
-  cute::array_aligned<Element, cute::cosize_v<SmemLayout>,
-                      cutlass::detail::alignment_for_swizzle(SmemLayout{})>
-      smem;
+  cute::array_aligned<Element, cute::cosize_v<SmemLayout>> smem;
 };
 
 template <typename Element> void transpose_smem(TransposeParams<Element> params) {
   auto tileShapeS = make_layout(block_shape, LayoutRight{});
   auto tileShapeD = make_layout(block_shape_trans, LayoutRight{});
 
+  // smemLayoutS uses the same layout as tileShapeS: a row-major (bM, bN)
+  // layout that describes how data from the source tile is stored in
+  // shared memory.
   auto smemLayoutS = tileShapeS;
+
+  // smemLayoutD composes smemLayoutS with tileShapeD to create the transposed
+  // view: composition(smemLayoutS, tileShapeD) applies tileShapeD's coordinate
+  // mapping first, then smemLayoutS's. Since tileShapeD has shape (bN, bM),
+  // the result maps (bN, bM) coordinates through the (bM, bN) storage, so
+  // reading through smemLayoutD walks shared memory in transposed order.
   auto smemLayoutD = composition(smemLayoutS, tileShapeD);
   auto smemLayoutS_swizzle = composition(Swizzle<5, 0, 5>{}, tileShapeS);
   auto smemLayoutD_swizzle = composition(smemLayoutS_swizzle, tileShapeD);

From 35b1308f9bbfd1d4f8ff2548652e80e1abfb33d1 Mon Sep 17 00:00:00 2001
From: Suryaprakash Shanmugam
Date: Wed, 5 Nov 2025 22:43:28 -0800
Subject: [PATCH 4/4] Add block2dcopy test code

---
 .../cute/tutorial/transpose/block_2d_copy.h   | 193 ++++++++++++++++++
 .../transpose/block_2d_transposed_copy.h      | 148 --------------
 2 files changed, 193 insertions(+), 148 deletions(-)
 create mode 100644 examples/cute/tutorial/transpose/block_2d_copy.h
 delete mode 100644 examples/cute/tutorial/transpose/block_2d_transposed_copy.h

diff --git a/examples/cute/tutorial/transpose/block_2d_copy.h b/examples/cute/tutorial/transpose/block_2d_copy.h
new file mode 100644
index 0000000000..696edeba31
--- /dev/null
+++ b/examples/cute/tutorial/transpose/block_2d_copy.h
@@ -0,0 +1,193 @@
+#pragma once
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#include
+#include
+#include
+#include
+
+#include "cute/stride.hpp"
+#include "cute/swizzle_layout.hpp"
+#include "cutlass/util/print_error.hpp"
+#include "util.h"
+
+template <class TensorS, class BlockShape, class TVLayout>
+void block2DCopyKernel(TensorS const S,
+                       BlockShape const block_shape,
+                       TVLayout const sv_layout) {
+  using namespace cute;
+  using Element = typename TensorS::value_type;
+
+  // get the m,n workgroup ids (thread-block indices)
+  // and the local id (threadIdx.x) of this thread
+  auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>();
+  auto wg_m = int(item.get_group(0));
+  auto wg_n = int(item.get_group(1));
+  auto local_id = int(item.get_local_id(0));
+
+  // create a coordinate tensor of the input matrix,
+  // to be used in copy atom operations
+  Tensor cS = make_identity_tensor(S.shape()); // (M,N)
+
+  // create a wg coordinate to slice the input matrix tile
+  auto wg_coord = make_coord(wg_m, wg_n);
+
+  Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N)
+
+  constexpr int CopyBits = sizeof_bits_v<Element>;
+  constexpr int Width = 8;
+  constexpr int Height = 8;
+
+  auto copy_op = XE_LOAD_2D<CopyBits, Height, Width>{};
+
+  auto S_stride = S.stride();
+  auto x_mode = find_x_mode(S_stride);
+  auto y_mode = find_y_mode(S_stride);
+
+  using CopyOp = decltype(copy_op);
+  using XMode = decltype(x_mode);
+  using YMode = decltype(y_mode);
+
+  // Divide coordinate codomain into copy tiles.
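+  // As a sketch with the numbers used below (an illustration under assumed
+  // sizes, not extra logic): dividing an (8, 8) work-group tile by the 8x8
+  // copy atom gives
+  //
+  //   atom_shape = shape_div(block_shape, op_tile)   // a single (1, 1) atom grid
+  //
+  // so one copy-atom invocation covers the whole tile; a (16, 32) tile would
+  // instead yield a 2 x 4 grid of atom-sized sub-tiles.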
+  auto op_tile = Int<Height>{} * E<YMode::value>{} +
+                 Int<Width>{} * E<XMode::value>{};
+  auto atom_shape = shape_div(block_shape, op_tile);
+
+  /* Slice TiledCopy operations to thread (work-item) level */
+  auto copy_S = make_block_2d_copy(copy_op,
+                                   S_stride,
+                                   x_mode,
+                                   y_mode,
+                                   atom_shape,
+                                   sv_layout.layout());
+  auto thr_copy_S = copy_S.get_slice(local_id);
+
+  /* Register fragments for the copy */
+  auto tSrS = thr_copy_S.partition_sg_fragment_D(gS);
+
+  /* Partition global tensor (proxies) for copies */
+  Tensor tSgS = thr_copy_S.partition_S(gS);
+
+  if (thread0()) {
+    print("block_shape: "); print(block_shape); print("\n");
+    print("atom_shape: "); print(atom_shape); print("\n");
+
+    print("S: "); print(S); print("\n");
+    print("cS: "); print(cS); print("\n");
+    print("gS: "); print(gS); print("\n\n");
+
+    print("copy_S: "); print(copy_S); print("\n\n");
+    print("thr_copy_S: "); print(thr_copy_S); print("\n\n");
+    print("tSgS: "); print(tSgS); print("\n");
+    print("tSrS: "); print(tSrS); print("\n\n");
+  }
+
+  copy(copy_S, tSgS, tSrS);
+}
+
+class CopyCuteName;
+template <typename Element>
+void block_2d_copy(TransposeParams<Element> params) {
+
+  using namespace cute;
+  //
+  // Make Tensors
+  //
+  auto tensor_shape = make_shape(params.M, params.N);
+  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
+  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
+
+  sycl::queue Q;
+
+  //
+  // Tile tensors
+  //
+
+  using bM = Int<8>;
+  using bN = Int<8>;
+
+  auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
+
+  sycl::range<2> local = {1, 16}; // 1 sub-group; keep it contiguous in the x-axis
+  sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}),
+                           local[1] * ceil_div(shape<1>(tensor_S), bN{})};
+
+  // Create a mapping from (S1, V64) -> (M8, N8).
+  // Each sub-group owns the copy of 64 elements through the 8x8 vectorized
+  // copy atom. Here we have just 1 sub-group performing the copy of an 8x8
+  // block, where each work-item is responsible for copying 4 elements.
+  constexpr int SubgroupSize = 16;
+  constexpr int NumSubgroups = 1;
+
+  // In NVIDIA CuTe, the tv-layout indexes into the thread mode to map a
+  // thread idx to all the linear value indices that the thread is
+  // responsible for.
+  // Similarly, we aim to achieve a mapping from subgroup idx to linear value
+  // indices.
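+  //
+  // Quick arithmetic check on the numbers above: 1 sub-group x 64 values
+  // covers the full 8x8 = 64-element tile, and with SubgroupSize = 16
+  // work-items per sub-group, each work-item moves 64 / 16 = 4 elements,
+  // matching the (S1, V64) -> (M8, N8) mapping described earlier.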
+  // Contiguous thread index within a sub-group maps to mode-1 (the x-axis)
+  // because the input layout is contiguous on mode-1 for coalesced
+  // load-store. The SGV layout maps subgroup & value index onto the (8, 8)
+  // logical tile.
+  auto sg_shape = make_shape(Int<NumSubgroups>{}, _1{});
+  // Layout for sub-groups tiling the workgroup tile
+  using sg_layout = decltype(make_layout(sg_shape, Stride<_1, _0>{}));
+  using val_layout = decltype(Layout<Shape<_8, _8>, Stride<_8, _1>>{});
+
+  // The following was taken from a make_tiled_copy overload that computes
+  // the equivalent of this in the CuTe DSL:
+  //   tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)
+
+  // Take the raked_product to compute the Layout_MN
+  // (M,N) -> (thr_idx, val_idx)
+  auto layout_mn = raked_product(sg_layout{}, val_layout{});
+  // (thr_idx, val_idx) -> (M,N)
+  auto sv_layout = right_inverse(layout_mn).with_shape(
+      make_shape(size(sg_layout{}), size(val_layout{})));
+
+#if 1
+  print("sg_layout: "); print(sg_layout{}); print("\n");
+  print("val_layout: "); print(val_layout{}); print("\n");
+  print("layout_mn: "); print(layout_mn); print("\n");
+  print("sv_layout: "); print(sv_layout); print("\n");
+#endif
+
+  namespace syclex = sycl::ext::oneapi::experimental;
+  namespace intelex = sycl::ext::intel::experimental;
+
+  syclex::properties kernel_props{syclex::sub_group_size<16>,
+                                  intelex::grf_size<256>};
+
+  auto event = Q.parallel_for(
+      sycl::nd_range<2>(global, local), kernel_props, [=](auto) {
+        block2DCopyKernel(tensor_S, block_shape, sv_layout);
+      });
+};
diff --git a/examples/cute/tutorial/transpose/block_2d_transposed_copy.h b/examples/cute/tutorial/transpose/block_2d_transposed_copy.h
deleted file mode 100644
index d061df9af0..0000000000
--- a/examples/cute/tutorial/transpose/block_2d_transposed_copy.h
+++ /dev/null
@@ -1,148 +0,0 @@
-#pragma once
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
- * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#include
-#include
-#include
-#include
-
-#include "cutlass/util/print_error.hpp"
-#include "util.h"
-
-template <class TensorS, class TensorD, class BlockShape,
-          class BlockShapeTrans, class ThreadLayout>
-void block2DTransposedLoadKernel(TensorS const S, TensorD const DT,
-                                 BlockShape const block_shape,
-                                 BlockShapeTrans const block_shape_transposed,
-                                 ThreadLayout const thread_layout) {
-  using namespace cute;
-  using Element = typename TensorS::value_type;
-
-  /* get workgroup and local ids */
-  auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>();
-  auto wg_m = int(item.get_group(0));
-  auto wg_n = int(item.get_group(1));
-  auto local_id = int(item.get_local_id(0));
-
-  /* proxy coordinate tensor */
-  Tensor cS = make_identity_tensor(S.shape());   // (M,N)
-  Tensor cDT = make_identity_tensor(DT.shape()); // (N,M)
-
-  auto wg_coord = make_coord(wg_m, wg_n);
-  auto wg_coord_transposed = make_coord(wg_n, wg_m);
-
-  // Tensor data = ... // ( M, N) Tensor cta_data = local_tile(data,
-  // Shape<16, 16>{}, make_coord(blockIdx.x,blockIdx.y)); // (_32,_64)
-  Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N)
-  Tensor gDT = local_tile(cDT, block_shape_transposed,
-                          wg_coord_transposed); // (BLK_N,BLK_M);
-
-  constexpr int CopyBits = sizeof_bits_v<Element>;
-  auto transposed_load_op = XE_LOAD_2D_TRANSPOSE<CopyBits>{};
-  auto store_op = XE_STORE_2D<CopyBits>{};
-
-  /* Slice TiledCopy operations to thread (work-item) level */
-  auto transpose_S = make_block_2d_copy(transposed_load_op, S);
-  auto thr_transpose_S = transpose_S.get_slice(local_id);
-
-  auto store_DT = make_block_2d_copy(store_op, DT);
-  auto thr_copy_DT = store_DT.get_slice(local_id);
-
-  /* Register fragments for transposed copy */
-  auto tSrS = thr_transpose_S.partition_sg_fragment_D(gS);
-  auto tDrD = thr_copy_DT.partition_sg_fragment_D(gDT);
-
-  /* Partition global tensor (proxies) for copies */
-  Tensor tSgS = thr_transpose_S.partition_S(gS);
-  Tensor tDgD = thr_copy_DT.partition_D(gDT);
-
-  // if ( cute::thread(0, 0)){
-  //   print(tSgS);print("\n");
-  //   print(tSrS);print("\n");
-  //   print(tDgD);print("\n");
-  // }
-
-  copy(transpose_S, tSgS, tSrS);
-  // copy(tSrS, tDrD);
-  copy(store_DT, tSrS, tDgD);
-}
-
-class TransposeCuteName;
-template <typename Element>
-void block_2d_transposed_copy(TransposeParams<Element> params) {
-
-  using namespace cute;
-  //
-  // Make Tensors
-  //
-  auto tensor_shape = make_shape(params.M, params.N);
-  auto tensor_shape_trans = make_shape(params.N, params.M);
-  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
-  auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
-  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
-  Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
-
-  // Make a transposed view of the output
-  // auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{});
-  // Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT);
-
-  sycl::queue Q;
-
-  //
-  // Tile tensors
-  //
-
-  using bM = Int<32>;
-  using bN = Int<8>;
-
-  auto block_shape = make_shape(bM{}, bN{});       // (bM, bN)
-  auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)
-
-  sycl::range<2> local = {bM{}, 1};
-  sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}),
-                           local[1] * ceil_div(shape<1>(tensor_S), bN{})};
-
-  auto threadLayout = make_layout(make_shape(bM{}, Int<1>{}), LayoutRight{});
-
-  namespace syclex = sycl::ext::oneapi::experimental;
-  namespace intelex = sycl::ext::intel::experimental;
-
-  syclex::properties kernel_props{syclex::sub_group_size<16>,
-                                  intelex::grf_size<256>};
-
-  auto event = Q.parallel_for(
-      sycl::nd_range<2>(global, local), kernel_props, [=](auto) {
-        block2DTransposedLoadKernel(tensor_S, tensor_DT, block_shape,
-                                    block_shape_trans, threadLayout);
-      });
-};
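
Usage note (a hypothetical sketch, not part of the patches above): this last
patch adds block_2d_copy.h but does not touch main.cpp, so wiring the new
benchmark in would follow the same pattern as the existing entries, assuming
the benchmark<float> harness shown earlier:

    #include "block_2d_copy.h"

    // inside main(), alongside the other benchmark calls:
    printf("\nBlock 2D copy:\n");
    benchmark<float>(block_2d_copy<float>, M, N, iterations);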