From 71829b7742e9f454a4ee432f97b3c50982be2d8f Mon Sep 17 00:00:00 2001 From: Suryaprakash Shanmugam Date: Tue, 28 Oct 2025 22:37:37 -0700 Subject: [PATCH 1/4] Add CuTe Matrix Transpose tutorial --- examples/common/sycl_cute_common.hpp | 16 ++ examples/cute/tutorial/CMakeLists.txt | 5 + .../cute/tutorial/transpose/copy_direct.h | 141 +++++++++++++++ examples/cute/tutorial/transpose/copy_smem.h | 148 ++++++++++++++++ .../transpose/tiled_transpose_sycl.cpp | 29 ++++ .../cute/tutorial/transpose/transpose_naive.h | 115 ++++++++++++ .../cute/tutorial/transpose/transpose_smem.h | 163 ++++++++++++++++++ examples/cute/tutorial/transpose/util.h | 103 +++++++++++ include/cute/util/compat/traits.hpp | 2 +- include/cutlass/platform/platform.h | 2 + 10 files changed, 723 insertions(+), 1 deletion(-) create mode 100644 examples/cute/tutorial/transpose/copy_direct.h create mode 100644 examples/cute/tutorial/transpose/copy_smem.h create mode 100644 examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp create mode 100644 examples/cute/tutorial/transpose/transpose_naive.h create mode 100644 examples/cute/tutorial/transpose/transpose_smem.h create mode 100644 examples/cute/tutorial/transpose/util.h diff --git a/examples/common/sycl_cute_common.hpp b/examples/common/sycl_cute_common.hpp index 4875a1a57b..1018d043d6 100644 --- a/examples/common/sycl_cute_common.hpp +++ b/examples/common/sycl_cute_common.hpp @@ -94,6 +94,22 @@ zero_fill(InTensor &X) X(i) = T(0); } +template +void +random_fill(std::vector &X) { + + for (int i = 0; i < X.size(); i++) + X[i] = random_value(); +} + +template +void +zero_fill(std::vector &X) { + for (int i = 0; i < X.size(); i++) + X[i] = T(0); +} + + // Pack sub-byte types in a gmem tensor. // On input, the backing array holds one sub-byte value per byte. // On exit, the backing array contains packed values. diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt index 673e968e60..d868a21e1b 100644 --- a/examples/cute/tutorial/CMakeLists.txt +++ b/examples/cute/tutorial/CMakeLists.txt @@ -45,6 +45,11 @@ if (CUTLASS_ENABLE_SYCL) tiled_copy_sycl.cpp ) + cutlass_example_add_executable( + cute_tutorial_tiled_transpose + transpose/tiled_transpose_sycl.cpp + ) + cutlass_example_add_executable( cute_tutorial_tiled_copy_if tiled_copy_if_sycl.cpp diff --git a/examples/cute/tutorial/transpose/copy_direct.h b/examples/cute/tutorial/transpose/copy_direct.h new file mode 100644 index 0000000000..10861b360e --- /dev/null +++ b/examples/cute/tutorial/transpose/copy_direct.h @@ -0,0 +1,141 @@ +#pragma once + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// copy kernel adapted from +// https://github.com/NVIDIA/cutlass/blob/main/examples/cute/tutorial/tiled_copy.cu + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +#include + +template +void copy_kernel(TensorS S, TensorD D, ThreadLayout) { + using namespace cute; + + // Slice the tiled tensors + Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + + // Construct a partitioning of the tile among threads with the given thread + // arrangement. + + // Concept: Tensor ThrLayout ThrIndex + Tensor thr_tile_S = local_partition( + tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) + Tensor thr_tile_D = local_partition( + tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) + // + + // Construct a register-backed Tensor with the same shape as each thread's + // partition Use make_tensor to try to match the layout of thr_tile_S + Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN) + + // Copy from GMEM to RMEM and from RMEM to GMEM + copy(thr_tile_S, fragment); + copy(fragment, thr_tile_D); +} + +template void copy_direct(TransposeParams params) { + // + // Given a 2D shape, perform an efficient copy + // + + using namespace cute; + + // + // Make tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // + // Tile tensors + // + + // Define a statically sized block (M, N). + // Note, by convention, capital letters are used to represent static modes. + auto block_shape = make_shape(Int<1>{}, Int<16384>{}); + + if ((size<0>(tensor_shape) % size<0>(block_shape)) || + (size<1>(tensor_shape) % size<1>(block_shape))) { + std::cerr << "The tensor shape must be divisible by the block shape." + << std::endl; + } + // Equivalent check to the above + if (not evenly_divides(tensor_shape, block_shape)) { + std::cerr << "Expected the block_shape to evenly divide the tensor shape." 
+ << std::endl; + } + + // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile + // shape, and modes (m', n') correspond to the number of tiles. + // + // These will be used to determine the CUDA kernel grid dimensions. + Tensor tiled_tensor_S = + tiled_divide(tensor_S, block_shape); // ((M, N), m', n') + Tensor tiled_tensor_D = + tiled_divide(tensor_D, block_shape); // ((M, N), m', n') + + // Thread arrangement + Layout thr_layout = + make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{}); + + // + // Determine grid and block dimensions + // + + auto gridDim = compat::dim3( + size<1>(tiled_tensor_S), + size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' + auto blockDim = compat::dim3(size(thr_layout)); + + // + // Launch the kernel + // + compat::launch>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout); +} diff --git a/examples/cute/tutorial/transpose/copy_smem.h b/examples/cute/tutorial/transpose/copy_smem.h new file mode 100644 index 0000000000..18f3fef90d --- /dev/null +++ b/examples/cute/tutorial/transpose/copy_smem.h @@ -0,0 +1,148 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +#include "cutlass/detail/layout.hpp" + +// Shared Storage for aligned addresses +template struct SharedStorageCopy { + cute::array_aligned> smem; +}; + +template +void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout, + SmemLayout) { + using namespace cute; + using Element = typename TensorS::value_type; + + // Use Shared Storage structure to allocate aligned SMEM addresses. 
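+  // A rough sizing sketch (assuming the bM = 1, bN = 8192 float tile used
+  // below): the work-group stages 8192 * 4 B = 32 KiB in shared local
+  // memory, which must fit within the device's reported local_mem_size.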
+  using SharedStorage = SharedStorageCopy<Element, SmemLayout>;
+  auto smem = compat::local_mem<SharedStorage[1]>();
+  SharedStorage &shared_storage = *reinterpret_cast<SharedStorage *>(smem);
+
+  Tensor gS = S(make_coord(_, _), compat::work_group_id::x(),
+                compat::work_group_id::y()); // (bM, bN)
+  Tensor gD = D(make_coord(_, _), compat::work_group_id::x(),
+                compat::work_group_id::y()); // (bN, bM)
+
+  Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()),
+                          SmemLayout{}); // (bN, bM)
+
+  auto tiled_copy_load = make_tiled_copy(
+      Copy_Atom<UniversalCopy<Element>, Element>{},
+      ThreadLayout{});
+
+  auto tiled_copy_store = make_tiled_copy(
+      Copy_Atom<UniversalCopy<Element>, Element>{},
+      ThreadLayout{});
+  //
+  // Construct a Tensor corresponding to each thread's slice.
+  auto thr_copy_load = tiled_copy_load.get_thread_slice(compat::local_id::x());
+  auto thr_copy_store =
+      tiled_copy_store.get_thread_slice(compat::local_id::x());
+
+  Tensor tSgS = thr_copy_load.partition_S(gS);
+  Tensor tSsS = thr_copy_load.partition_D(sS);
+  //
+  Tensor tDsS = thr_copy_store.partition_D(sS);
+  Tensor tDgD = thr_copy_store.partition_D(gD);
+
+  copy(tiled_copy_load, tSgS, tSsS);
+
+  cp_async_fence();
+  cp_async_wait<0>();
+  syncthreads();
+  //
+  copy(tiled_copy_store, tDsS, tDgD);
+}
+
+template <typename Element> void copy_smem(TransposeParams<Element> params) {
+
+  using namespace cute;
+
+  //
+  // Make tensors
+  //
+  auto tensor_shape = make_shape(params.M, params.N);
+  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
+  auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
+  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
+  Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
+
+  //
+  // Tile tensors
+  using bM = Int<1>;
+  using bN = Int<8192>;
+
+  auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
+
+  auto smem_layout = make_layout(block_shape, LayoutRight{});
+
+  Tensor tiled_tensor_S =
+      tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')
+  Tensor tiled_tensor_D =
+      tiled_divide(tensor_D, block_shape); // ((bN, bM), n', m')
+
+  auto threadLayout =
+      make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});
+
+  //
+  // Determine grid and block dimensions
+  //
+
+  dim3 gridDim(
+      size<1>(tiled_tensor_S),
+      size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
+  dim3 blockDim(size(threadLayout)); // 1024 work-items
+
+  constexpr int smem_size =
+      int(sizeof(SharedStorageCopy<Element, decltype(smem_layout)>));
+
+  //
+  // Launch the kernel
+  //
+  compat::launch<
+      copySmemKernel<decltype(tiled_tensor_S), decltype(tiled_tensor_D),
+                     decltype(threadLayout), decltype(smem_layout)>>(
+      gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, threadLayout,
+      smem_layout);
+}
diff --git a/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp b/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp
new file mode 100644
index 0000000000..c179fa6485
--- /dev/null
+++ b/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp
@@ -0,0 +1,29 @@
+#include "copy_direct.h"
+#include "copy_smem.h"
+#include "transpose_naive.h"
+#include "transpose_smem.h"
+#include "util.h"
+
+int main(int argc, char const **argv) {
+
+  using Element = float;
+
+  int size = 16384;
+  int M = size, N = size, iterations = 10;
+
+  std::cout << "Matrix size: " << M << " x " << N << std::endl;
+
+  printf("Baseline copy.\n");
+  benchmark<Element, false>(copy_direct<Element>, M, N, iterations);
+
+  printf("\nNaive transpose (no smem):\n");
+  benchmark<Element, true>(transpose_naive<Element>, M, N, iterations);
+
+  printf("\nCopy through SMEM.\n");
+  benchmark<Element, false>(copy_smem<Element>, M, N, iterations);
+
+  printf("\nTranspose through SMEM:\n");
+  benchmark<Element, true>(transpose_smem<Element>, M, N, iterations);
+
+  return 0;
+}
diff --git
a/examples/cute/tutorial/transpose/transpose_naive.h b/examples/cute/tutorial/transpose/transpose_naive.h new file mode 100644 index 0000000000..d73203a1af --- /dev/null +++ b/examples/cute/tutorial/transpose/transpose_naive.h @@ -0,0 +1,115 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +template +void transposeKernelNaive(TensorS const S, TensorD const DT, + ThreadLayoutS const tS, ThreadLayoutD const tD) { + using namespace cute; + using Element = typename TensorS::value_type; + + Tensor gS = S(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (bM, bN) + Tensor gDT = DT(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (bM, bN) + + Tensor tSgS = local_partition(gS, ThreadLayoutS{}, + compat::local_id::x()); // (ThrValM, ThrValN) + Tensor tDgDT = local_partition(gDT, ThreadLayoutD{}, compat::local_id::x()); + + Tensor rmem = make_tensor_like(tSgS); + + copy(tSgS, rmem); + copy(rmem, tDgDT); +} + +template +void transpose_naive(TransposeParams params) { + + using namespace cute; + // + // Make Tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto tensor_shape_trans = make_shape(params.N, params.M); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // Make a transposed view of the output + auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{}); + Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT); + + // + // Tile tensors + // + + using bM = Int<8>; + using bN = Int<512>; + + auto block_shape = make_shape(bM{}, bN{}); // (bM, bN) + auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM) + + Tensor tiled_tensor_S = + tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n') + Tensor tiled_tensor_DT = + tiled_divide(tensor_DT, block_shape); // ((bM, bN), m', n') + + auto threadLayoutS = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + auto threadLayoutD = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + + auto gridDim = compat::dim3( + size<1>(tiled_tensor_S), + size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' + auto blockDim = compat::dim3(size(threadLayoutS)); + + // + // Launch the kernel + // + compat::launch< + transposeKernelNaive>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_DT, threadLayoutS, + threadLayoutD); +}; diff --git a/examples/cute/tutorial/transpose/transpose_smem.h b/examples/cute/tutorial/transpose/transpose_smem.h new file mode 100644 index 0000000000..51aa1bddba --- /dev/null +++ b/examples/cute/tutorial/transpose/transpose_smem.h @@ -0,0 +1,163 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include +#include + +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +#include "cutlass/detail/layout.hpp" + +// Shared Storage for aligned addresses +template struct SharedStorageTranspose { + cute::array_aligned, + cutlass::detail::alignment_for_swizzle(SmemLayout{})> + smem; +}; + +template +void transposeSmemKernel(TensorS const S, TensorD const D, + SmemLayoutS const smemLayoutS, ThreadLayoutS const tS, + SmemLayoutD const smemLayoutD, + ThreadLayoutD const tD) { + using namespace cute; + using Element = typename TensorS::value_type; + + // Use Shared Storage structure to allocate aligned SMEM addresses. 
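+  // A rough sizing sketch (assuming the bM = 64, bN = 128 float tile used
+  // below): the staging buffer is 64 * 128 * 4 B = 32 KiB, and
+  // cutlass::detail::alignment_for_swizzle pads the alignment so the
+  // swizzled SmemLayout variant remains valid.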
+ using SharedStorage = SharedStorageTranspose; + auto smem = compat::local_mem(); + SharedStorage &shared_storage = *reinterpret_cast(smem); + + // two different views of smem + Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()), + smemLayoutS); // (bM, bN) + Tensor sD = make_tensor(make_smem_ptr(shared_storage.smem.data()), + smemLayoutD); // (bN, bM) + + Tensor gS = S(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (bM, bN) + Tensor gD = D(make_coord(_, _), compat::work_group_id::y(), + compat::work_group_id::x()); // (bN, bM) + + Tensor tSgS = + local_partition(gS, tS, compat::local_id::x()); // (ThrValM, ThrValN) + Tensor tSsS = + local_partition(sS, tS, compat::local_id::x()); // (ThrValM, ThrValN) + Tensor tDgD = local_partition(gD, tD, compat::local_id::x()); + Tensor tDsD = local_partition(sD, tD, compat::local_id::x()); + + cute::copy(tSgS, tSsS); // LDGSTS + + cp_async_fence(); + cp_async_wait<0>(); + syncthreads(); + + cute::copy(tDsD, tDgD); +} + +template +void transpose_smem(TransposeParams params) { + + using namespace cute; + + // + // Make tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto tensor_shape_trans = make_shape(params.N, params.M); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // + // Tile tensors + // + + using bM = Int<64>; + using bN = Int<128>; + + auto block_shape = make_shape(bM{}, bN{}); // (bM, bN) + auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM) + + Tensor tiled_tensor_S = + tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n') + Tensor tiled_tensor_D = + tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m') + + auto tileShapeS = make_layout(block_shape, LayoutRight{}); + auto tileShapeD = make_layout(block_shape_trans, LayoutRight{}); + + auto smemLayoutS = tileShapeS; + auto smemLayoutD = composition(smemLayoutS, tileShapeD); + auto smemLayoutS_swizzle = composition(Swizzle<5, 0, 5>{}, tileShapeS); + auto smemLayoutD_swizzle = composition(smemLayoutS_swizzle, tileShapeD); + + auto threadLayoutS = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + auto threadLayoutD = + make_layout(make_shape(Int<8>{}, Int<64>{}), LayoutRight{}); + + constexpr int smem_size = + int(sizeof(SharedStorageTranspose)); + + // + // Determine grid and block dimensions + // + + dim3 gridDim( + size<1>(tiled_tensor_S), + size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' + dim3 blockDim(size(threadLayoutS)); // 256 threads + + if constexpr (isSwizzled) { + compat::launch>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, smemLayoutS_swizzle, + threadLayoutS, smemLayoutD_swizzle, threadLayoutD); + } else { + compat::launch>( + gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, smemLayoutS, + threadLayoutS, smemLayoutD, threadLayoutD); + } +} diff --git a/examples/cute/tutorial/transpose/util.h b/examples/cute/tutorial/transpose/util.h new file mode 100644 index 0000000000..984fd12689 --- /dev/null +++ b/examples/cute/tutorial/transpose/util.h @@ -0,0 +1,103 @@ +#pragma once + +#include "../../../common/sycl_cute_common.hpp" +#include + +template struct TransposeParams { + T *__restrict__ input; + T *__restrict__ output; + + const int M; + const int N; + + TransposeParams(T *__restrict__ input_, T 
*__restrict__ output_, int M_, + int N_) + : input(input_), output(output_), M(M_), N(N_) {} +}; + +template +int benchmark(void (*transpose)(TransposeParams params), int M, int N, + int iterations = 10, bool verify = true) { + using namespace cute; + + auto tensor_shape_S = make_shape(M, N); + auto tensor_shape_D = (isTranspose) ? make_shape(N, M) : make_shape(M, N); + + // + // Allocate and initialize + // + std::vector h_S(size(tensor_shape_S)); + std::vector h_D(size(tensor_shape_D)); + + auto d_S = compat::malloc(size(tensor_shape_S)); + auto d_D = compat::malloc(size(tensor_shape_D)); + + if (not is_random) { + for (size_t i = 0; i < h_S.size(); ++i) { + h_S[i] = static_cast(i); + } + } else { + random_fill(h_S); + } + + compat::memcpy(d_S, h_S.data(), size(tensor_shape_S)); + + TransposeParams params(d_S, d_D, M, N); + + for (int i = 0; i < iterations; i++) { + auto t1 = std::chrono::high_resolution_clock::now(); + transpose(params); + compat::wait_and_throw(); + auto t2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration tDiff = t2 - t1; + double time_ms = tDiff.count(); + double M_ = double(M); + double N_ = double(N); + double bytes = 2 * M_ * N_ * sizeof(T); + + std::cout << "Trial " << i << " Completed in " << time_ms << "ms (" + << std::fixed << std::setprecision(2) << 1e-6 * bytes / time_ms + << " GB/s)" << std::endl; + } + + if (verify) { + compat::memcpy(h_D.data(), d_D, size(tensor_shape_D)); + + int bad = 0; + if constexpr (isTranspose) { + auto transpose_function = make_layout(tensor_shape_S, LayoutRight{}); + for (size_t i = 0; i < h_D.size(); ++i) + if (h_D[i] != h_S[transpose_function(i)]) + bad++; + } else { + for (size_t i = 0; i < h_D.size(); ++i) + if (h_D[i] != h_S[i]) + bad++; + } +#if 0 + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + std::cout << (int)h_S[i * N + j] << "\t"; + } + std::cout << std::endl; + } + std::cout << std::endl; + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + std::cout << (int)h_D[i * N + j] << "\t"; + } + std::cout << std::endl; + } + +#endif + + if (bad > 0) { + std::cout << "Validation failed. Correct values: " << h_D.size() - bad + << ". Incorrect values: " << bad << std::endl; + } else { + std::cout << "Validation success." 
<< std::endl; + } + } + return 0; +} diff --git a/include/cute/util/compat/traits.hpp b/include/cute/util/compat/traits.hpp index fcb3f3bc43..cd94efbd03 100644 --- a/include/cute/util/compat/traits.hpp +++ b/include/cute/util/compat/traits.hpp @@ -89,7 +89,7 @@ template struct range_to_item_map> { using ItemT = sycl::nd_item; }; template struct range_to_item_map> { - using ItemT = sycl::item; + using ItemT = sycl::item; }; template diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h index 7e3816394e..1b3b6227cd 100644 --- a/include/cutlass/platform/platform.h +++ b/include/cutlass/platform/platform.h @@ -866,6 +866,8 @@ struct numeric_limits { CUTLASS_HOST_DEVICE static constexpr float infinity() noexcept { return bit_cast(0x7f800000);} CUTLASS_HOST_DEVICE + static constexpr float lowest() noexcept { return -bit_cast(0x7f7fffff) - 1;} + CUTLASS_HOST_DEVICE static constexpr float max() noexcept { return bit_cast(0x7f7fffff);} static constexpr bool is_integer = false; static constexpr bool has_infinity = true; From 1f53721adeba8f60b63a76589716d345d8d59ca4 Mon Sep 17 00:00:00 2001 From: Suryaprakash Shanmugam Date: Tue, 28 Oct 2025 22:37:42 -0700 Subject: [PATCH 2/4] Add the native sycl kernel code --- .../{tiled_transpose_sycl.cpp => main.cpp} | 0 .../tutorial/transpose/transpose_sycl.cpp | 544 ++++++++++++++++++ 2 files changed, 544 insertions(+) rename examples/cute/tutorial/transpose/{tiled_transpose_sycl.cpp => main.cpp} (100%) create mode 100644 examples/cute/tutorial/transpose/transpose_sycl.cpp diff --git a/examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp b/examples/cute/tutorial/transpose/main.cpp similarity index 100% rename from examples/cute/tutorial/transpose/tiled_transpose_sycl.cpp rename to examples/cute/tutorial/transpose/main.cpp diff --git a/examples/cute/tutorial/transpose/transpose_sycl.cpp b/examples/cute/tutorial/transpose/transpose_sycl.cpp new file mode 100644 index 0000000000..6e4a31d1b4 --- /dev/null +++ b/examples/cute/tutorial/transpose/transpose_sycl.cpp @@ -0,0 +1,544 @@ +/* + * Objective: Transpose a square matrix tile of size 32 on a side + * + * */ + +/* + * Work Group Configuration: Each work group handles one tile. + + Dimensions: (TILE_DIM x BLOCK_ROWS) = (32 x 8) work-items. + This means each work group contains 32 * 8 = 256 work-items. + For a 32x32 tile (1024 elements), each work-item processes 1024 / 256 = 4 + elements. + + Example Work Group for Tile (0,0): + Thread indices within the work group (local_id): + localID[0] + ^ + | + 7 | t(0,7) t(1,7) t(2,7) ... t(31,7) + 6 | t(0,6) t(1,6) t(2,6) ... t(31,6) + 5 | t(0,5) t(1,5) t(2,5) ... t(31,5) + 4 | t(0,4) t(1,4) t(2,4) ... t(31,4) + 3 | t(0,3) t(1,3) t(2,3) ... t(31,3) + 2 | t(0,2) t(1,2) t(2,2) ... t(31,2) + 1 | t(0,1) t(1,1) t(2,1) ... t(31,1) + 0 | t(0,0) t(1,0) t(2,0) ... t(31,0) --> localId[1]: 0 1 2 ... 
31
+        +------------------------------------->
+ */
+
+#include <sycl/sycl.hpp>
+
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "benchmark.h"
+
+// size of the entire square N x N matrix
+// we still use separate variables for the sides so we can
+// think about tile and block indexing in the matrix rows/cols
+constexpr size_t N = 16384;
+constexpr size_t Nr = N;
+constexpr size_t Nc = N;
+
+// size of a single data tile that we will work with
+// we use 32 here (matching the 32x32 tile described above) so that
+// column accesses stride through the same banks and demonstrate bank
+// conflicts on intel gpus
+constexpr size_t TILE_DIM = 32;
+
+// number of rows in our workgroup
+// intentionally this is a smaller number because we want to use
+// a single work-item to copy 4 elements
+constexpr size_t BLOCK_ROWS = TILE_DIM / 4;
+
+constexpr size_t numIters = 100;
+
+typedef unsigned int uint;
+using T = float;
+
+template <typename AccT> auto get_accessor_pointer(const AccT &acc) {
+  return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
+}
+
+int main() {
+  std::vector<T> A(Nr * Nc);
+  std::vector<T> A_T(Nr * Nc);
+  std::vector<T> A_T_ref(Nr * Nc);
+
+  if (Nr % TILE_DIM or Nc % TILE_DIM) {
+    throw std::runtime_error("Nr and Nc must be multiples of TILE_DIM");
+  }
+
+  if (TILE_DIM % BLOCK_ROWS) {
+    throw std::runtime_error("TILE_DIM must be a multiple of BLOCK_ROWS");
+  }
+
+  // fill the matrix and prep ref output on the host
+  for (int i = 0; i < Nr; i++)
+    for (int j = 0; j < Nc; j++)
+      A[i * Nr + j] = i * Nr + j; // data same as the linear physical index
+
+  // for the ref transpose out, flip the fastest-varying index on the reads
+  for (int i = 0; i < Nr; i++)
+    for (int j = 0; j < Nc; j++)
+      A_T_ref[i * Nr + j] = j * Nr + i;
+
+  try {
+    auto q = sycl::queue{sycl::property::queue::enable_profiling{}};
+
+    std::cout << "Running on "
+              << q.get_device().get_info<sycl::info::device::name>() << "\n";
+    std::cout << "Local Memory Size: "
+              << q.get_device().get_info<sycl::info::device::local_mem_size>() /
+                     1024
+              << "KB" << std::endl;
+    std::cout
+        << "Max Work Group Size: "
+        << q.get_device().get_info<sycl::info::device::max_work_group_size>()
+        << std::endl;
+
+    sycl::range<2> dataRange{Nr, Nc};
+    // divide the row dim by 4 as we use a single work-item to move 4 values
+    sycl::range<2> globalRange{Nr / 4, Nc};
+    sycl::range<2> localRange{BLOCK_ROWS, TILE_DIM};
+    sycl::nd_range<2> ndRange{globalRange, localRange};
+
+    {
+      sycl::buffer<T, 2> h_idata{A.data(), dataRange};
+      sycl::buffer<T, 2> h_odata{A_T.data(), dataRange};
+
+      // Simple copy without coalescing to demonstrate its inefficiency
+      auto simple_copy = [&]() {
+        q.submit([&](sycl::handler &cgh) {
+          sycl::accessor d_idata{h_idata, cgh, sycl::read_only};
+          sycl::accessor d_odata{h_odata, cgh, sycl::write_only,
+                                 sycl::property::no_init{}};
+
+          cgh.parallel_for(
+              ndRange, [=](sycl::nd_item<2> item) {
+                auto globalId = item.get_global_id();
+                auto localId = item.get_local_id();
+
+                auto row_id = item.get_group(0) * TILE_DIM + localId[0];
+                auto col_id = globalId[1];
+
+                for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
+                  // the swapped indices make consecutive work-items access
+                  // locations a full row apart, so nothing is coalesced
+                  sycl::id<2> dataIdx{col_id, row_id + i};
+                  d_odata[dataIdx] = d_idata[dataIdx];
+                }
+              });
+        });
+        q.wait_and_throw();
+      };
+
+      // Simple copy with coalescing used as reference for best effective
+      // bandwidth
+      auto simple_coalesced_copy = [&]() {
+        q.submit([&](sycl::handler &cgh) {
+          sycl::accessor d_idata{h_idata, cgh, sycl::read_only};
+          sycl::accessor d_odata{h_odata, cgh, sycl::write_only,
+                                 sycl::property::no_init{}};
+
+          cgh.parallel_for(
+              ndRange, [=](sycl::nd_item<2> item) {
+                auto globalId = item.get_global_id();
+                auto localId = item.get_local_id();
+
+                // get_group(0) gives the work-group id along the row dim,
+                // which we scale by TILE_DIM to get the group offset.
+                // Just using the global
id wouldn't work here + // because we don't have a 1:1 thread:value + // mapping here) + auto row_id = item.get_group(0) * TILE_DIM + localId[0]; + // work-items of the fastest varying dimension (1) access + // consecutive memory locations so that loads and stores + // are coalesced + auto col_id = globalId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + d_odata[dataIdx] = d_idata[dataIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // Naive Transpose + // reads are coalesced, but writes are not + auto naive_transpose = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto row_id = item.get_group(0) * TILE_DIM + localId[0]; + auto col_id = globalId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id idataIdx{row_id + i, col_id}; + // swap the output buffer's indices to transpose it + sycl::id odataIdx{col_id, row_id + i}; + d_odata[odataIdx] = d_idata[idataIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // Tiled copy through SMEM as a baseline for SMEM transpose + auto smem_copy = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + sycl::range tileRange{TILE_DIM, TILE_DIM}; + sycl::local_accessor sharedMemTile{tileRange, cgh}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sharedMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // We need to wait here to ensure that all work items + // have written to local memory before we start reading + // from it. 
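+                // (The barrier must be work-group scope: the tile in SLM
+                // is produced and consumed by all work-items of the
+                // group, so a sub-group barrier would not be sufficient.)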
+ sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced write to gmem from smem + d_odata[dataIdx] = sharedMemTile[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // Coalesce reads and writes to global memory but do the strided + // access required for the transpose in shared local memory as it + // doesn't levy as much a much penalty when done in SLM compared to + // GMEM + auto smem_transpose = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + sycl::range tileRange{TILE_DIM, TILE_DIM}; + sycl::local_accessor sharedMemTile{tileRange, cgh}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + auto row_id_T = groupOffset_1 + localId[0]; + auto col_id_T = groupOffset_0 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sharedMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // We need to wait here to ensure that all work items + // have written to local memory before we start reading + // from it. + sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id_T + i, col_id_T}; + // this creates strided reads in smem, but the writes + // to gmem are still coalesced + sycl::id smemTileIdx{localId[1], localId[0] + i}; + d_odata[dataIdx] = sharedMemTile[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // SMEM Transpose avoiding bank conflict by allocating TILE_DIM + 1 on + // SMEM column dimension, causing every element in the smem to fall in + // a different shared memory bank; kernel is the same as above + auto smem_transpose_no_bank_conflict = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + sycl::range tileRange{TILE_DIM, TILE_DIM + 1}; + sycl::local_accessor sharedMemTile{tileRange, cgh}; + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<2> item) { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sharedMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // We need to wait here to ensure that all work items + // have + // written to local memory before we start reading from + // it + sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + // output tile offsets need to be swapped + sycl::id dataIdx{groupOffset_1 + localId[0] + i, + groupOffset_0 + localId[1]}; + // this creates strided reads in smem, but the writes 
+ // to + // gmem are still coalesced + sycl::id smemTileIdx{localId[1], localId[0] + i}; + d_odata[dataIdx] = sharedMemTile[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + + // // transpose using subgroup shuffle functions + // util::benchmark( + // [&]() { + // q.submit([&](sycl::handler &cgh) { + // sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + // sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + // sycl::property::no_init{}}; + // + // // this kernel requires the tile size to be equal to the + // // sub-group size used so we can use the sub-group shuffle + // // functions + // constexpr size_t BLOCK_SIZE = 16; + // cgh.parallel_for( + // sycl::nd_range<2>(sycl::range<2>(Nr / BLOCK_SIZE, Nc), + // sycl::range<2>(1, BLOCK_SIZE)), + // [=](sycl::nd_item<2> it) + // [[sycl::reqd_sub_group_size(16)]] + // { + // auto localId = it.get_local_id(); + // int gi = it.get_group(0); + // int gj = it.get_group(1); + // + // auto sg = it.get_sub_group(); + // uint sgId = sg.get_local_id()[0]; + // + // float bcol[BLOCK_SIZE]; + // int ai = BLOCK_SIZE * gi; + // int aj = BLOCK_SIZE * gj; + // + // for (uint k = 0; k < BLOCK_SIZE; k++) { + // // load columns of the matrix tile into the + // subgroup bcol[k] = + // sg.load(get_accessor_pointer(d_idata) + + // (ai + k) * Nc + aj); + // } + // + // // no barriers required here because the threads of a + // // sub-group execute concurrently, so all columns of + // the + // // matrix were loaded into bcol already + // + // float tcol[BLOCK_SIZE]; + // for (uint n = 0; n < BLOCK_SIZE; n++) { + // if (sgId == n) { + // for (uint k = 0; k < BLOCK_SIZE; k++) { + // // returns the value of bcol[n] from the k-th + // // work-item + // tcol[k] = sycl::select_from_group(sg, bcol[n], + // k); + // } + // } + // } + // + // for (uint k = 0; k < BLOCK_SIZE; k++) { + // sg.store(get_accessor_pointer(d_odata) + (aj + k) * + // Nc + // + + // ai, + // tcol[k]); + // } + // }); + // }); + // q.wait_and_throw(); + // }, + // numIters, Nc * Nr, + // "Tiled GMEM Transpose with sub-group shuffle functions"); + + // Tiled Transpose using the sub-group shuffle function + // where loads and stores are to shared local memory + auto tiled_subgroup_shuffle = [&]() { + q.submit([&](sycl::handler &cgh) { + sycl::accessor d_idata{h_idata, cgh, sycl::read_only}; + sycl::accessor d_odata{h_odata, cgh, sycl::write_only, + sycl::property::no_init{}}; + // sub-group size == data block size in smem to be transposed + constexpr size_t BLOCK_SIZE = 16; + sycl::range tileRange{TILE_DIM, TILE_DIM}; + sycl::local_accessor sMemTile{tileRange, cgh}; + sycl::local_accessor sMemTileTransposed{tileRange, cgh}; + + cgh.parallel_for( + ndRange, + [=](sycl::nd_item<2> item) [[sycl::reqd_sub_group_size(16)]] { + auto globalId = item.get_global_id(); + auto localId = item.get_local_id(); + + auto groupOffset_0 = item.get_group(0) * TILE_DIM; + auto groupOffset_1 = item.get_group(1) * TILE_DIM; + + auto row_id = groupOffset_0 + localId[0]; + auto col_id = groupOffset_1 + localId[1]; + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{row_id + i, col_id}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced read from gmem into smem + sMemTile[smemTileIdx] = d_idata[dataIdx]; + } + + // Now sMem holds a 32x32 size data tile that we + // transpose with sub-group select function + // Each 1x16 sub-group of work-item can transpose one + // 16x16 tile in the SMEM. 
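+                // (With TILE_DIM = 32 and a sub-group size of 16, each
+                // pass below fixes ai to one 16-row band of the tile.)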
So we need to loop over the + // process two times to transpose an entire 32x32 tile + { + auto sg = item.get_sub_group(); + uint sgId = sg.get_local_id()[0]; + + float bcol[BLOCK_SIZE]; + int ai = BLOCK_SIZE * 0; + int aj = BLOCK_SIZE * sg.get_group_id(); + + for (uint k = 0; k < BLOCK_SIZE; k++) { + // load columns of the matrix + // tile into the subgroup + bcol[k] = sg.load(get_accessor_pointer(sMemTile) + + (ai + k) * TILE_DIM + aj); + } + + float tcol[BLOCK_SIZE]; + for (uint n = 0; n < BLOCK_SIZE; n++) { + if (sgId == n) { + for (uint k = 0; k < BLOCK_SIZE; k++) { + // returns the value of bcol[n] from the k-th + // work-item + tcol[k] = sycl::select_from_group(sg, bcol[n], k); + } + } + } + + for (uint k = 0; k < BLOCK_SIZE; k++) { + sg.store(get_accessor_pointer(sMemTileTransposed) + + (aj + k) * TILE_DIM + ai, + tcol[k]); + } + } + + { + auto sg = item.get_sub_group(); + uint sgId = sg.get_local_id()[0]; + + float bcol[BLOCK_SIZE]; + int ai = BLOCK_SIZE * 1; + int aj = BLOCK_SIZE * sg.get_group_id(); + + for (uint k = 0; k < BLOCK_SIZE; k++) { + // load columns of the matrix + // tile into the subgroup + bcol[k] = sg.load(get_accessor_pointer(sMemTile) + + (ai + k) * TILE_DIM + aj); + } + + float tcol[BLOCK_SIZE]; + for (uint n = 0; n < BLOCK_SIZE; n++) { + if (sgId == n) { + for (uint k = 0; k < BLOCK_SIZE; k++) { + // returns the value of bcol[n] from the k-th + // work-item + tcol[k] = sycl::select_from_group(sg, bcol[n], k); + } + } + } + + for (uint k = 0; k < BLOCK_SIZE; k++) { + sg.store(get_accessor_pointer(sMemTileTransposed) + + (aj + k) * TILE_DIM + ai, + tcol[k]); + } + } + + // We need to wait here to ensure that all work items + // have written to local memory before we start reading + // from it. + sycl::group_barrier(item.get_group()); + + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + sycl::id dataIdx{groupOffset_1 + localId[0] + i, + groupOffset_0 + localId[1]}; + sycl::id smemTileIdx{localId[0] + i, localId[1]}; + + // coalesced write to gmem from smem + d_odata[dataIdx] = sMemTileTransposed[smemTileIdx]; + } + }); + }); + q.wait_and_throw(); + }; + util::benchmark(simple_copy, numIters, Nc * Nr, + "Simple Non-Coalesced Tiled Copy"); + util::benchmark(simple_coalesced_copy, numIters, Nc * Nr, + "Simple Tiled Copy"); + util::benchmark(naive_transpose, numIters, Nc * Nr, "Naive Transpose"); + util::benchmark(smem_copy, numIters, Nc * Nr, "Tiled SMEM Copy"); + util::benchmark(smem_transpose, numIters, Nc * Nr, + "Tiled SMEM Transpose"); + util::benchmark(smem_transpose_no_bank_conflict, numIters, Nc * Nr, + "Tiled SMEM Transpose avoiding Bank Conflict"); + util::benchmark(tiled_subgroup_shuffle, numIters, Nc * Nr, + "Tiled SMEM Transpose with sub-group shuffle functions"); + } + } catch (const sycl::exception &e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } From 3578f8e5ef50da67a099feedfbac0830a408db94 Mon Sep 17 00:00:00 2001 From: Suryaprakash Shanmugam Date: Tue, 28 Oct 2025 22:38:58 -0700 Subject: [PATCH 3/4] Add transposed load kernel through the new Xe Copy Atoms --- examples/cute/tutorial/CMakeLists.txt | 2 +- .../transpose/block_2d_transposed_copy.h | 148 ++++++++++++++++ .../cute/tutorial/transpose/copy_direct.h | 158 ++++++++++-------- examples/cute/tutorial/transpose/copy_smem.h | 8 +- examples/cute/tutorial/transpose/main.cpp | 5 + .../cute/tutorial/transpose/transpose_naive.h | 4 - .../cute/tutorial/transpose/transpose_smem.h | 12 +- 7 files changed, 254 insertions(+), 83 deletions(-) create mode 100644 
examples/cute/tutorial/transpose/block_2d_transposed_copy.h diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt index d868a21e1b..cff90792ad 100644 --- a/examples/cute/tutorial/CMakeLists.txt +++ b/examples/cute/tutorial/CMakeLists.txt @@ -47,7 +47,7 @@ if (CUTLASS_ENABLE_SYCL) cutlass_example_add_executable( cute_tutorial_tiled_transpose - transpose/tiled_transpose_sycl.cpp + transpose/main.cpp ) cutlass_example_add_executable( diff --git a/examples/cute/tutorial/transpose/block_2d_transposed_copy.h b/examples/cute/tutorial/transpose/block_2d_transposed_copy.h new file mode 100644 index 0000000000..d061df9af0 --- /dev/null +++ b/examples/cute/tutorial/transpose/block_2d_transposed_copy.h @@ -0,0 +1,148 @@ +#pragma once +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#include +#include +#include +#include + +#include "cutlass/util/print_error.hpp" +#include "util.h" + +template +void block2DTransposedLoadKernel(TensorS const S, TensorD const DT, + BlockShape const block_shape, + BlockShapeTrans const block_shape_transposed, + ThreadLayout const thread_layout) { + using namespace cute; + using Element = typename TensorS::value_type; + + /* get workgroup and local ids */ + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>(); + auto wg_m = int(item.get_group(0)); + auto wg_n = int(item.get_group(1)); + auto local_id = int(item.get_local_id(0)); + + /* proxy coordinate tensor */ + Tensor cS = make_identity_tensor(S.shape()); // (M,N) + Tensor cDT = make_identity_tensor(DT.shape()); // (N,M) + + auto wg_coord = make_coord(wg_m, wg_n); + auto wg_coord_transposed = make_coord(wg_n, wg_m); + + // Tensor data = ... // ( M, N) Tensor cta_data = local_tile(data, + // Shape<16, 16>{}, make_coord(blockIdx.x,blockIdx.y)); // (_32,_64) + Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N) + Tensor gDT = local_tile(cDT, block_shape_transposed, + wg_coord_transposed); // (BLK_N,BLK_M); + + constexpr int CopyBits = sizeof_bits_v; + auto transposed_load_op = XE_LOAD_2D_TRANSPOSE{}; + auto store_op = XE_STORE_2D{}; + + /* Slice TiledCopy operations to thread (work-item) level */ + auto transpose_S = make_block_2d_copy(transposed_load_op, S); + auto thr_transpose_S = transpose_S.get_slice(local_id); + + auto store_DT = make_block_2d_copy(store_op, DT); + auto thr_copy_DT = store_DT.get_slice(local_id); + + /* Register fragments for transposed copy */ + auto tSrS = thr_transpose_S.partition_sg_fragment_D(gS); + auto tDrD = thr_copy_DT.partition_sg_fragment_D(gDT); + + /* Partition global tensor (proxies) for copies */ + Tensor tSgS = thr_transpose_S.partition_S(gS); + Tensor tDgD = thr_copy_DT.partition_D(gDT); + + // if ( cute::thread(0, 0)){ + // print(tSgS);print("\n"); + // print(tSrS);print("\n"); + // print(tDgD);print("\n"); + // } + + copy(transpose_S, tSgS, tSrS); + // copy(tSrS, tDrD); + copy(store_DT, tSrS, tDgD); +} + +class TransposeCuteName; +template +void block_2d_transposed_copy(TransposeParams params) { + + using namespace cute; + // + // Make Tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto tensor_shape_trans = make_shape(params.N, params.M); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // Make a transposed view of the output + // auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{}); + // Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT); + + sycl::queue Q; + + // + // Tile tensors + // + + using bM = Int<32>; + using bN = Int<8>; + + auto block_shape = make_shape(bM{}, bN{}); // (bM, bN) + auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM) + + sycl::range<2> local = {bM{}, 1}; + sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}), + local[1] * ceil_div(shape<1>(tensor_S), bN{})}; + + auto threadLayout = make_layout(make_shape(bM{}, Int<1>{}), LayoutRight{}); + + namespace syclex = sycl::ext::oneapi::experimental; + namespace intelex = sycl::ext::intel::experimental; + + syclex::properties 
kernel_props{syclex::sub_group_size<16>, + intelex::grf_size<256>}; + + auto event = Q.parallel_for( + sycl::nd_range<2>(global, local), kernel_props, [=](auto) { + block2DTransposedLoadKernel(tensor_S, tensor_DT, block_shape, + block_shape_trans, threadLayout); + }); +}; diff --git a/examples/cute/tutorial/transpose/copy_direct.h b/examples/cute/tutorial/transpose/copy_direct.h index 10861b360e..a4d255bfd4 100644 --- a/examples/cute/tutorial/transpose/copy_direct.h +++ b/examples/cute/tutorial/transpose/copy_direct.h @@ -51,6 +51,22 @@ void copy_kernel(TensorS S, TensorD D, ThreadLayout) { using namespace cute; // Slice the tiled tensors + // This line slices the tiled tensor S to get the tile for the current work + // group. S is a 3D tensor with layout ((M, N), m', n') where: + // - (M, N) is the block/tile shape (first mode) + // - m' is the number of tiles in the M dimension (second mode) + // - n' is the number of tiles in the N dimension (third mode) + // + // The indexing S(make_coord(_, _), x, y) selects: + // - make_coord(_, _): Takes all elements from the first mode (M, N), i.e., + // the entire tile + // - compat::work_group_id::x(): Selects the x-th tile along the m' + // dimension + // - compat::work_group_id::y(): Selects the y-th tile along the n' + // dimension + // + // Result: A 2D tensor of shape (BlockShape_M, BlockShape_N) corresponding to + // the tile assigned to the current work group. Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(), compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(), @@ -64,78 +80,78 @@ void copy_kernel(TensorS S, TensorD D, ThreadLayout) { tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) Tensor thr_tile_D = local_partition( tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) - // - - // Construct a register-backed Tensor with the same shape as each thread's - // partition Use make_tensor to try to match the layout of thr_tile_S - Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN) - - // Copy from GMEM to RMEM and from RMEM to GMEM - copy(thr_tile_S, fragment); - copy(fragment, thr_tile_D); -} - -template void copy_direct(TransposeParams params) { - // - // Given a 2D shape, perform an efficient copy - // - - using namespace cute; - - // - // Make tensors - // - auto tensor_shape = make_shape(params.M, params.N); - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); - Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); - Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); - - // - // Tile tensors - // - // Define a statically sized block (M, N). - // Note, by convention, capital letters are used to represent static modes. - auto block_shape = make_shape(Int<1>{}, Int<16384>{}); + // Construct a register-backed Tensor with the same shape as each thread's + // partition Use make_tensor to try to match the layout of thr_tile_S + Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN) - if ((size<0>(tensor_shape) % size<0>(block_shape)) || - (size<1>(tensor_shape) % size<1>(block_shape))) { - std::cerr << "The tensor shape must be divisible by the block shape." - << std::endl; - } - // Equivalent check to the above - if (not evenly_divides(tensor_shape, block_shape)) { - std::cerr << "Expected the block_shape to evenly divide the tensor shape." 
- << std::endl; + // Copy from GMEM to RMEM and from RMEM to GMEM + copy(thr_tile_S, fragment); + copy(fragment, thr_tile_D); } - // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile - // shape, and modes (m', n') correspond to the number of tiles. - // - // These will be used to determine the CUDA kernel grid dimensions. - Tensor tiled_tensor_S = - tiled_divide(tensor_S, block_shape); // ((M, N), m', n') - Tensor tiled_tensor_D = - tiled_divide(tensor_D, block_shape); // ((M, N), m', n') - - // Thread arrangement - Layout thr_layout = - make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{}); - - // - // Determine grid and block dimensions - // - - auto gridDim = compat::dim3( - size<1>(tiled_tensor_S), - size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n' - auto blockDim = compat::dim3(size(thr_layout)); - - // - // Launch the kernel - // - compat::launch>( - gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout); -} + template + void copy_direct(TransposeParams params) { + // + // Given a 2D shape, perform an efficient copy + // + + using namespace cute; + + // + // Make tensors + // + auto tensor_shape = make_shape(params.M, params.N); + auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); + Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); + Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); + + // + // Tile tensors + // + + // Define a statically sized block (M, N). + // Note, by convention, capital letters are used to represent static modes. + auto block_shape = make_shape(Int<1>{}, Int<16384>{}); + + if ((size<0>(tensor_shape) % size<0>(block_shape)) || + (size<1>(tensor_shape) % size<1>(block_shape))) { + std::cerr << "The tensor shape must be divisible by the block shape." + << std::endl; + } + // Equivalent check to the above + if (not evenly_divides(tensor_shape, block_shape)) { + std::cerr << "Expected the block_shape to evenly divide the tensor shape." + << std::endl; + } + + // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static + // tile shape, and modes (m', n') correspond to the number of tiles. + // + // These will be used to determine the CUDA kernel grid dimensions. 
+  Tensor tiled_tensor_S =
+      tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
+  Tensor tiled_tensor_D =
+      tiled_divide(tensor_D, block_shape); // ((M, N), m', n')
+
+  // Thread arrangement
+  Layout thr_layout =
+      make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});
+
+  //
+  // Determine grid and block dimensions
+  //
+
+  auto gridDim = compat::dim3(
+      size<1>(tiled_tensor_S),
+      size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
+  auto blockDim = compat::dim3(size(thr_layout));
+
+  //
+  // Launch the kernel
+  //
+  compat::launch<copy_kernel<decltype(tiled_tensor_S), decltype(tiled_tensor_D),
+                             decltype(thr_layout)>>(
+      gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout);
+}
diff --git a/examples/cute/tutorial/transpose/copy_smem.h b/examples/cute/tutorial/transpose/copy_smem.h
index 18f3fef90d..05399f8b64 100644
--- a/examples/cute/tutorial/transpose/copy_smem.h
+++ b/examples/cute/tutorial/transpose/copy_smem.h
@@ -62,10 +62,10 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   Tensor gS = S(make_coord(_, _), compat::work_group_id::x(),
                 compat::work_group_id::y()); // (bM, bN)
   Tensor gD = D(make_coord(_, _), compat::work_group_id::x(),
-                compat::work_group_id::y()); // (bN, bM)
+                compat::work_group_id::y()); // (bM, bN)
 
   Tensor sS = make_tensor(make_smem_ptr(shared_storage.smem.data()),
-                          SmemLayout{}); // (bN, bM)
+                          SmemLayout{}); // (bM, bN)
 
   auto tiled_copy_load = make_tiled_copy(
       Copy_Atom, Element>{},
@@ -83,7 +83,7 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   Tensor tSgS = thr_copy_load.partition_S(gS);
   Tensor tSsS = thr_copy_load.partition_D(sS);
   //
-  Tensor tDsS = thr_copy_store.partition_D(sS);
+  Tensor tDsD = thr_copy_store.partition_S(sS);
   Tensor tDgD = thr_copy_store.partition_D(gD);
 
   copy(tiled_copy_load, tSgS, tSsS);
@@ -92,7 +92,7 @@ void copySmemKernel(TensorS const S, TensorD const D, ThreadLayout,
   cp_async_wait<0>();
   syncthreads();
   //
-  copy(tiled_copy_store, tDsS, tDgD);
+  copy(tiled_copy_store, tDsD, tDgD);
 }
 
 template <typename Element> void copy_smem(TransposeParams<Element> params) {
diff --git a/examples/cute/tutorial/transpose/main.cpp b/examples/cute/tutorial/transpose/main.cpp
index c179fa6485..b4c4e61275 100644
--- a/examples/cute/tutorial/transpose/main.cpp
+++ b/examples/cute/tutorial/transpose/main.cpp
@@ -1,3 +1,4 @@
+#include "block_2d_transposed_copy.h"
 #include "copy_direct.h"
 #include "copy_smem.h"
 #include "transpose_naive.h"
@@ -25,5 +26,9 @@ int main(int argc, char const **argv) {
   printf("\nTranspose through SMEM.:\n");
   benchmark<float>(transpose_smem<float>, M, N, iterations);
 
+  printf("\nBlock 2D transposed load:\n");
+  benchmark<float>(block_2d_transposed_copy<float>, M, N, iterations);
+
   return 0;
 }
diff --git a/examples/cute/tutorial/transpose/transpose_naive.h b/examples/cute/tutorial/transpose/transpose_naive.h
index d73203a1af..70aed2cdf3 100644
--- a/examples/cute/tutorial/transpose/transpose_naive.h
+++ b/examples/cute/tutorial/transpose/transpose_naive.h
@@ -69,11 +69,8 @@ void transpose_naive(TransposeParams<Element> params) {
   // Make Tensors
   //
   auto tensor_shape = make_shape(params.M, params.N);
-  auto tensor_shape_trans = make_shape(params.N, params.M);
   auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
-  auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
   Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
-  Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
 
   // Make a transposed view of the output
   auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{});
@@ -87,7 +84,6 @@ void transpose_naive(TransposeParams<Element> params) {
   using bN = Int<512>;
 
   auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
-  auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)
 
   Tensor tiled_tensor_S =
       tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')
diff --git a/examples/cute/tutorial/transpose/transpose_smem.h b/examples/cute/tutorial/transpose/transpose_smem.h
index 51aa1bddba..b251df97f2 100644
--- a/examples/cute/tutorial/transpose/transpose_smem.h
+++ b/examples/cute/tutorial/transpose/transpose_smem.h
@@ -44,9 +44,7 @@
 // Shared Storage for aligned addresses
 template <class Element, class SmemLayout> struct SharedStorageTranspose {
-  cute::array_aligned<Element, cute::cosize_v<SmemLayout>,
-                      cutlass::detail::alignment_for_swizzle(SmemLayout{})>
-      smem;
+  cute::array_aligned<Element, cute::cosize_v<SmemLayout>> smem;
 };
 
 template <typename Element> void transpose_smem(TransposeParams<Element> params) {
   auto tileShapeS = make_layout(block_shape, LayoutRight{});
   auto tileShapeD = make_layout(block_shape_trans, LayoutRight{});
 
+  // smemLayoutS uses the same layout as tileShapeS: a row-major (bM, bN)
+  // layout that describes how data from the source tile is stored in
+  // shared memory.
   auto smemLayoutS = tileShapeS;
+
+  // smemLayoutD composes smemLayoutS with tileShapeD to create the transposed
+  // view: composition(smemLayoutS, tileShapeD) applies tileShapeD's coordinate
+  // mapping first, then smemLayoutS's. Since tileShapeD has shape (bN, bM),
+  // the result maps (bN, bM) coordinates through the (bM, bN) storage, so
+  // reading through smemLayoutD walks shared memory in transposed order.
   auto smemLayoutD = composition(smemLayoutS, tileShapeD);
   auto smemLayoutS_swizzle = composition(Swizzle<5, 0, 5>{}, tileShapeS);
   auto smemLayoutD_swizzle = composition(smemLayoutS_swizzle, tileShapeD);

From 35b1308f9bbfd1d4f8ff2548652e80e1abfb33d1 Mon Sep 17 00:00:00 2001
From: Suryaprakash Shanmugam
Date: Wed, 5 Nov 2025 22:43:28 -0800
Subject: [PATCH 4/4] Add block2dcopy test code

---
 .../cute/tutorial/transpose/block_2d_copy.h   | 193 ++++++++++++++++++
 .../transpose/block_2d_transposed_copy.h      | 148 --------------
 2 files changed, 193 insertions(+), 148 deletions(-)
 create mode 100644 examples/cute/tutorial/transpose/block_2d_copy.h
 delete mode 100644 examples/cute/tutorial/transpose/block_2d_transposed_copy.h

diff --git a/examples/cute/tutorial/transpose/block_2d_copy.h b/examples/cute/tutorial/transpose/block_2d_copy.h
new file mode 100644
index 0000000000..696edeba31
--- /dev/null
+++ b/examples/cute/tutorial/transpose/block_2d_copy.h
@@ -0,0 +1,193 @@
+#pragma once
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#include
+#include
+#include
+#include
+
+#include "cute/stride.hpp"
+#include "cute/swizzle_layout.hpp"
+#include "cutlass/util/print_error.hpp"
+#include "util.h"
+
+template <class TensorS, class BlockShape, class TVLayout>
+void block2DCopyKernel(TensorS const S,
+                       BlockShape const block_shape,
+                       TVLayout const sv_layout) {
+  using namespace cute;
+  using Element = typename TensorS::value_type;
+
+  // get the m,n workgroup ids (thread-block indices)
+  // and the local id (threadIdx.x) of this thread
+  auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>();
+  auto wg_m = int(item.get_group(0));
+  auto wg_n = int(item.get_group(1));
+  auto local_id = int(item.get_local_id(0));
+
+  // create a coordinate tensor of the input matrix,
+  // to be used in copy atom operations
+  Tensor cS = make_identity_tensor(S.shape()); // (M,N)
+
+  // create a wg coordinate to slice the input matrix tile
+  auto wg_coord = make_coord(wg_m, wg_n);
+
+  Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N)
+
+  constexpr int CopyBits = sizeof_bits_v<Element>;
+  constexpr int Width = 8;
+  constexpr int Height = 8;
+
+  auto copy_op = XE_LOAD_2D<CopyBits, Height, Width>{};
+
+  auto S_stride = S.stride();
+  auto x_mode = find_x_mode(S_stride);
+  auto y_mode = find_y_mode(S_stride);
+
+  using CopyOp = decltype(copy_op);
+  using XMode = decltype(x_mode);
+  using YMode = decltype(y_mode);
+
+  // Divide coordinate codomain into copy tiles.
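+  // As a sketch with the numbers used below (an illustration under assumed
+  // sizes, not extra logic): dividing an (8, 8) work-group tile by the 8x8
+  // copy atom gives
+  //
+  //   atom_shape = shape_div(block_shape, op_tile)   // a single (1, 1) atom grid
+  //
+  // so one copy-atom invocation covers the whole tile; a (16, 32) tile would
+  // instead yield a 2 x 4 grid of atom-sized sub-tiles.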
+  auto op_tile = Int<Height>{} * E<YMode::value>{} +
+                 Int<Width>{} * E<XMode::value>{};
+  auto atom_shape = shape_div(block_shape, op_tile);
+
+  /* Slice TiledCopy operations to thread (work-item) level */
+  auto copy_S = make_block_2d_copy(copy_op,
+                                   S_stride,
+                                   x_mode,
+                                   y_mode,
+                                   atom_shape,
+                                   sv_layout.layout());
+  auto thr_copy_S = copy_S.get_slice(local_id);
+
+  /* Register fragments for the copy */
+  auto tSrS = thr_copy_S.partition_sg_fragment_D(gS);
+
+  /* Partition global tensor (proxies) for copies */
+  Tensor tSgS = thr_copy_S.partition_S(gS);
+
+  if (thread0()) {
+    print("block_shape: "); print(block_shape); print("\n");
+    print("atom_shape: "); print(atom_shape); print("\n");
+
+    print("S: "); print(S); print("\n");
+    print("cS: "); print(cS); print("\n");
+    print("gS: "); print(gS); print("\n\n");
+
+    print("copy_S: "); print(copy_S); print("\n\n");
+    print("thr_copy_S: "); print(thr_copy_S); print("\n\n");
+    print("tSgS: "); print(tSgS); print("\n");
+    print("tSrS: "); print(tSrS); print("\n\n");
+  }
+
+  copy(copy_S, tSgS, tSrS);
+}
+
+class CopyCuteName;
+template <typename Element>
+void block_2d_copy(TransposeParams<Element> params) {
+
+  using namespace cute;
+  //
+  // Make Tensors
+  //
+  auto tensor_shape = make_shape(params.M, params.N);
+  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
+  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
+
+  sycl::queue Q;
+
+  //
+  // Tile tensors
+  //
+
+  using bM = Int<8>;
+  using bN = Int<8>;
+
+  auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)
+
+  sycl::range<2> local = {1, 16}; // 1 sub-group; keep it contiguous in the x-axis
+  sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}),
+                           local[1] * ceil_div(shape<1>(tensor_S), bN{})};
+
+  // Create a mapping from (S1, V64) -> (M8, N8).
+  // Each sub-group owns the copy of 64 elements through the 8x8 vectorized
+  // copy atom. Here we have just 1 sub-group performing the copy of an 8x8
+  // block, where each work-item is responsible for copying 4 elements.
+  constexpr int SubgroupSize = 16;
+  constexpr int NumSubgroups = 1;
+
+  // In NVIDIA CuTe, the tv-layout indexes into the thread mode to map a
+  // thread idx to all the linear value indices that the thread is
+  // responsible for.
+  // Similarly, we aim to achieve a mapping from subgroup idx to linear value
+  // indices.
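+  //
+  // Quick arithmetic check on the numbers above: 1 sub-group x 64 values
+  // covers the full 8x8 = 64-element tile, and with SubgroupSize = 16
+  // work-items per sub-group, each work-item moves 64 / 16 = 4 elements,
+  // matching the (S1, V64) -> (M8, N8) mapping described earlier.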
+  // Contiguous thread index within a sub-group maps to mode-1 (the x-axis)
+  // because the input layout is contiguous on mode-1 for coalesced
+  // load-store. The SGV layout maps subgroup & value index onto the (8, 8)
+  // logical tile.
+  auto sg_shape = make_shape(Int<NumSubgroups>{}, _1{});
+  // Layout for sub-groups tiling the workgroup tile
+  using sg_layout = decltype(make_layout(sg_shape, Stride<_1, _0>{}));
+  using val_layout = decltype(Layout<Shape<_8, _8>, Stride<_8, _1>>{});
+
+  // The following was taken from a make_tiled_copy overload that computes
+  // the equivalent of this in the CuTe DSL:
+  //   tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)
+
+  // Take the raked_product to compute the Layout_MN
+  // (M,N) -> (thr_idx, val_idx)
+  auto layout_mn = raked_product(sg_layout{}, val_layout{});
+  // (thr_idx, val_idx) -> (M,N)
+  auto sv_layout = right_inverse(layout_mn).with_shape(
+      make_shape(size(sg_layout{}), size(val_layout{})));
+
+#if 1
+  print("sg_layout: "); print(sg_layout{}); print("\n");
+  print("val_layout: "); print(val_layout{}); print("\n");
+  print("layout_mn: "); print(layout_mn); print("\n");
+  print("sv_layout: "); print(sv_layout); print("\n");
+#endif
+
+  namespace syclex = sycl::ext::oneapi::experimental;
+  namespace intelex = sycl::ext::intel::experimental;
+
+  syclex::properties kernel_props{syclex::sub_group_size<16>,
+                                  intelex::grf_size<256>};
+
+  auto event = Q.parallel_for(
+      sycl::nd_range<2>(global, local), kernel_props, [=](auto) {
+        block2DCopyKernel(tensor_S, block_shape, sv_layout);
+      });
+};
diff --git a/examples/cute/tutorial/transpose/block_2d_transposed_copy.h b/examples/cute/tutorial/transpose/block_2d_transposed_copy.h
deleted file mode 100644
index d061df9af0..0000000000
--- a/examples/cute/tutorial/transpose/block_2d_transposed_copy.h
+++ /dev/null
@@ -1,148 +0,0 @@
-#pragma once
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
- * reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#include
-#include
-#include
-#include
-
-#include "cutlass/util/print_error.hpp"
-#include "util.h"
-
-template <class TensorS, class TensorD, class BlockShape,
-          class BlockShapeTrans, class ThreadLayout>
-void block2DTransposedLoadKernel(TensorS const S, TensorD const DT,
-                                 BlockShape const block_shape,
-                                 BlockShapeTrans const block_shape_transposed,
-                                 ThreadLayout const thread_layout) {
-  using namespace cute;
-  using Element = typename TensorS::value_type;
-
-  /* get workgroup and local ids */
-  auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>();
-  auto wg_m = int(item.get_group(0));
-  auto wg_n = int(item.get_group(1));
-  auto local_id = int(item.get_local_id(0));
-
-  /* proxy coordinate tensor */
-  Tensor cS = make_identity_tensor(S.shape());   // (M,N)
-  Tensor cDT = make_identity_tensor(DT.shape()); // (N,M)
-
-  auto wg_coord = make_coord(wg_m, wg_n);
-  auto wg_coord_transposed = make_coord(wg_n, wg_m);
-
-  // Tensor data = ... // ( M, N) Tensor cta_data = local_tile(data,
-  // Shape<16, 16>{}, make_coord(blockIdx.x,blockIdx.y)); // (_32,_64)
-  Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N)
-  Tensor gDT = local_tile(cDT, block_shape_transposed,
-                          wg_coord_transposed); // (BLK_N,BLK_M);
-
-  constexpr int CopyBits = sizeof_bits_v<Element>;
-  auto transposed_load_op = XE_LOAD_2D_TRANSPOSE<CopyBits>{};
-  auto store_op = XE_STORE_2D<CopyBits>{};
-
-  /* Slice TiledCopy operations to thread (work-item) level */
-  auto transpose_S = make_block_2d_copy(transposed_load_op, S);
-  auto thr_transpose_S = transpose_S.get_slice(local_id);
-
-  auto store_DT = make_block_2d_copy(store_op, DT);
-  auto thr_copy_DT = store_DT.get_slice(local_id);
-
-  /* Register fragments for transposed copy */
-  auto tSrS = thr_transpose_S.partition_sg_fragment_D(gS);
-  auto tDrD = thr_copy_DT.partition_sg_fragment_D(gDT);
-
-  /* Partition global tensor (proxies) for copies */
-  Tensor tSgS = thr_transpose_S.partition_S(gS);
-  Tensor tDgD = thr_copy_DT.partition_D(gDT);
-
-  // if ( cute::thread(0, 0)){
-  //   print(tSgS);print("\n");
-  //   print(tSrS);print("\n");
-  //   print(tDgD);print("\n");
-  // }
-
-  copy(transpose_S, tSgS, tSrS);
-  // copy(tSrS, tDrD);
-  copy(store_DT, tSrS, tDgD);
-}
-
-class TransposeCuteName;
-template <typename Element>
-void block_2d_transposed_copy(TransposeParams<Element> params) {
-
-  using namespace cute;
-  //
-  // Make Tensors
-  //
-  auto tensor_shape = make_shape(params.M, params.N);
-  auto tensor_shape_trans = make_shape(params.N, params.M);
-  auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
-  auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
-  Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
-  Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);
-
-  // Make a transposed view of the output
-  // auto gmemLayoutDT = make_layout(tensor_shape, GenColMajor{});
-  // Tensor tensor_DT = make_tensor(make_gmem_ptr(params.output), gmemLayoutDT);
-
-  sycl::queue Q;
-
-  //
-  // Tile tensors
-  //
-
-  using bM = Int<32>;
-  using bN = Int<8>;
-
-  auto block_shape = make_shape(bM{}, bN{});       // (bM, bN)
-  auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)
-
-  sycl::range<2> local = {bM{}, 1};
-  sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}),
-                           local[1] * ceil_div(shape<1>(tensor_S), bN{})};
-
-  auto threadLayout = make_layout(make_shape(bM{}, Int<1>{}), LayoutRight{});
-
-  namespace syclex = sycl::ext::oneapi::experimental;
-  namespace intelex = sycl::ext::intel::experimental;
-
-  syclex::properties kernel_props{syclex::sub_group_size<16>,
-                                  intelex::grf_size<256>};
-
-  auto event = Q.parallel_for(
-      sycl::nd_range<2>(global, local), kernel_props, [=](auto) {
-        block2DTransposedLoadKernel(tensor_S, tensor_DT, block_shape,
-                                    block_shape_trans, threadLayout);
-      });
-};
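
Usage note (a hypothetical sketch, not part of the patches above): this last
patch adds block_2d_copy.h but does not touch main.cpp, so wiring the new
benchmark in would follow the same pattern as the existing entries, assuming
the benchmark<float> harness shown earlier:

    #include "block_2d_copy.h"

    // inside main(), alongside the other benchmark calls:
    printf("\nBlock 2D copy:\n");
    benchmark<float>(block_2d_copy<float>, M, N, iterations);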