16 changes: 16 additions & 0 deletions examples/common/sycl_cute_common.hpp
@@ -94,6 +94,22 @@ zero_fill(InTensor &X)
X(i) = T(0);
}

template <typename T>
void
random_fill(std::vector<T> &X) {
for (std::size_t i = 0; i < X.size(); i++)
X[i] = random_value<T>();
}

template <typename T>
void
zero_fill(std::vector<T> &X) {
for (std::size_t i = 0; i < X.size(); i++)
X[i] = T(0);
}


// Pack sub-byte types in a gmem tensor.
// On input, the backing array holds one sub-byte value per byte.
// On exit, the backing array contains packed values.
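The new std::vector overloads mirror the existing tensor fill helpers in this header. A minimal host-side usage sketch (random_value<T> comes from the same header; init_host_buffers and its M, N parameters are illustrative, not part of this diff):

#include <vector>
// sycl_cute_common.hpp provides random_fill, zero_fill and random_value.

void init_host_buffers(int M, int N) {                         // M, N are example dimensions
  std::vector<float> host_S(static_cast<std::size_t>(M) * N);  // source buffer
  std::vector<float> host_D(static_cast<std::size_t>(M) * N);  // destination buffer
  random_fill(host_S);   // one random_value<float>() per element
  zero_fill(host_D);     // clear the destination before the device copy
}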
5 changes: 5 additions & 0 deletions examples/cute/tutorial/CMakeLists.txt
@@ -45,6 +45,11 @@ if (CUTLASS_ENABLE_SYCL)
tiled_copy_sycl.cpp
)

cutlass_example_add_executable(
cute_tutorial_tiled_transpose
transpose/main.cpp
)

cutlass_example_add_executable(
cute_tutorial_tiled_copy_if
tiled_copy_if_sycl.cpp
193 changes: 193 additions & 0 deletions examples/cute/tutorial/transpose/block_2d_copy.h
@@ -0,0 +1,193 @@
#pragma once
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <cute/tensor.hpp>
#include <cute/util/compat.hpp>
#include <sycl/ext/intel/experimental/grf_size_properties.hpp>
#include <sycl/sycl.hpp>

#include "cute/stride.hpp"
#include "cute/swizzle_layout.hpp"
#include "cutlass/util/print_error.hpp"
#include "util.h"

template <class TensorS, class BlockShape, class TVLayout>
void block2DCopyKernel(TensorS const S,
BlockShape const block_shape,
TVLayout const sv_layout) {
using namespace cute;
using Element = typename TensorS::value_type;

// Get the (m, n) workgroup IDs (thread-block indices)
// and the local ID (threadIdx.x equivalent) of this work-item.
auto item = sycl::ext::oneapi::this_work_item::get_nd_item<2>();
auto wg_m = int(item.get_group(0));
auto wg_n = int(item.get_group(1));
auto local_id = int(item.get_local_id(0));

// create a coordinate tensor of the input matrix;
// to be used in copy atom operations
Tensor cS = make_identity_tensor(S.shape()); // (M,N)

// create a wg coordinate to slice the input matrix tile
auto wg_coord = make_coord(wg_m, wg_n);

Tensor gS = local_tile(cS, block_shape, wg_coord); // (BLK_M,BLK_N)

constexpr int CopyBits = sizeof_bits_v<Element>;
constexpr int Width = 8;
constexpr int Height = 8;

// auto transposed_load_op = XE_LOAD_2D_TRANSPOSE<CopyBits, Width, Height>{};
auto copy_op = XE_LOAD_2D<CopyBits, Width, Height>{};

auto S_stride = S.stride();
auto x_mode = find_x_mode(S_stride);
auto y_mode = find_y_mode(S_stride);

using CopyOp = decltype(copy_op);
using XMode = decltype(x_mode);
using YMode = decltype(y_mode);

// Divide coordinate codomain into copy tiles.
auto op_tile = Int<Width>{} * E<XMode::value>{}
+ Int<Height>{} * E<YMode::value>{};
auto atom_shape = shape_div(block_shape, op_tile);
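// For example (not taken from printed output): with Width = Height = 8 and
// the (8, 8) block_shape configured by the host code below, a single copy
// atom spans the whole workgroup tile, so atom_shape works out to (1, 1).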

/* Slice TiledCopy operations to thread (work-item) level */
auto copy_S = make_block_2d_copy<Element>(copy_op,
S_stride,
x_mode,
y_mode,
atom_shape,
sv_layout.layout());
auto thr_copy_S = copy_S.get_slice(local_id);

/* Register fragments for the 2D block copy */
auto tSrS = thr_copy_S.partition_sg_fragment_D(gS);

/* Partition global tensor (proxies) for copies */
Tensor tSgS = thr_copy_S.partition_S(gS);

if (thread0()) {
print("block_shape: "); print(block_shape); print("\n");
print("atom_shape: "); print(atom_shape); print("\n");

print("S: "); print(S); print("\n");
print("cS: "); print(cS); print("\n");
print("gS: "); print(gS); print("\n\n");

print("copy_S: "); print(copy_S); print("\n\n");
print("thr_copy_S: "); print(thr_copy_S); print("\n\n");
print("tSgS: "); print(tSgS); print("\n");
print("tSrS: "); print(tSrS); print("\n\n");
}

copy(copy_S, tSgS, tSrS);
}

class CopyCuteName;
template <typename Element>
void block_2d_copy(TransposeParams<Element> params) {

using namespace cute;
//
// Make Tensors
//
auto tensor_shape = make_shape(params.M, params.N);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);

sycl::queue Q;

//
// Tile tensors
//

using bM = Int<8>;
using bN = Int<8>;

auto block_shape = make_shape(bM{}, bN{}); // (bM, bN)

sycl::range<2> local = {1, 16}; // 1 sub-group; keep the sub-group contiguous along the x-axis
sycl::range<2> global = {local[0] * ceil_div(shape<0>(tensor_S), bM{}),
local[1] * ceil_div(shape<1>(tensor_S), bN{})};
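// Illustration (example sizes, not from this diff): for a 64 x 64 input,
// ceil_div(64, bM) = ceil_div(64, bN) = 8, so global = {8, 128}: an 8 x 8
// grid of workgroups, each consisting of one 16-wide sub-group.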

// Create a mapping from (S1, V64) -> (M8, N8).
// Each sub-group owns the copy of 64 elements through the 8x8 vectorized copy atom.
// Here a single sub-group copies one 8x8 block, so each work-item
// is responsible for copying 4 elements.
constexpr int SubgroupSize = 16;
// constexpr int NumSubgroups = get<0>(block_shape); // 1
constexpr int NumSubgroups = 1;

// In NVIDIA CuTe, the TV layout maps a thread index to all of the linear
// value indices that the thread is responsible for.
// Similarly, we want a mapping from sub-group index to linear value indices.
// The contiguous thread index within a sub-group maps to mode-1 (the x-axis)
// because the input layout is contiguous on mode-1, giving coalesced
// loads/stores.
// The SV layout maps (sub-group, value) indices onto the (8, 8) logical tile.

// Layout of sub-groups tiling the workgroup tile
auto sg_shape = make_shape(Int<NumSubgroups>{}, _1{});
using sg_layout = decltype(make_layout(sg_shape, Stride<_1, _0>{}));
// Layout of values owned by each sub-group
using val_layout = decltype(Layout<Shape<_8, _8>, Stride<_8, _1>>{});

// The following was taken from a make_tiled_copy overload; it computes
// the equivalent of this CuTe DSL call:
// tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)

// Take the raked product to compute layout_mn:
// (M, N) -> (sg_idx, val_idx)
auto layout_mn = raked_product(sg_layout{}, val_layout{});
// Invert to get (sg_idx, val_idx) -> (M, N)
auto sv_layout = right_inverse(layout_mn).with_shape(make_shape(size(sg_layout{}), size(val_layout{})));
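// Sanity check for this single-sub-group, 8x8 case (derived by hand, not
// printed output): layout_mn reduces to the row-major (8,8):(8,1) value
// layout, so sv_layout sends value index v of the one sub-group to tile
// coordinate (v / 8, v % 8). Consecutive values therefore walk along mode-1
// (the contiguous x-axis), matching the coalescing argument above.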

#if 1
print("sg_layout: "); print(sg_layout{}); print("\n");
print("val_layout: "); print(val_layout{}); print("\n");
print("layout_mn : "); print(layout_mn); print("\n");
print("sv_layout: "); print(sv_layout); print("\n");
#endif

namespace syclex = sycl::ext::oneapi::experimental;
namespace intelex = sycl::ext::intel::experimental;

syclex::properties kernel_props{syclex::sub_group_size<16>,
intelex::grf_size<256>};

auto event = Q.parallel_for<CopyCuteName>(
sycl::nd_range<2>(global, local), kernel_props, [=](auto) {
block2DCopyKernel(tensor_S, block_shape, sv_layout);
});
event.wait(); // block until the copy kernel has finished
}
157 changes: 157 additions & 0 deletions examples/cute/tutorial/transpose/copy_direct.h
@@ -0,0 +1,157 @@
#pragma once

/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. Copyright (C) 2025 Intel Corporation, All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

// copy kernel adapted from
// https://github.com/NVIDIA/cutlass/blob/main/examples/cute/tutorial/tiled_copy.cu

#include <cute/util/compat.hpp>
#include <sycl/sycl.hpp>

#include <cute/tensor.hpp>

#include "cutlass/util/print_error.hpp"
#include "util.h"

#include <iomanip>

template <class TensorS, class TensorD, class ThreadLayout>
void copy_kernel(TensorS S, TensorD D, ThreadLayout) {
using namespace cute;

// Slice the tiled tensors
// This line slices the tiled tensor S to get the tile for the current work
// group. S is a 3D tensor with layout ((M, N), m', n') where:
// - (M, N) is the block/tile shape (first mode)
// - m' is the number of tiles in the M dimension (second mode)
// - n' is the number of tiles in the N dimension (third mode)
//
// The indexing S(make_coord(_, _), x, y) selects:
// - make_coord(_, _): Takes all elements from the first mode (M, N), i.e.,
// the entire tile
// - compat::work_group_id::x(): Selects the x-th tile along the m'
// dimension
// - compat::work_group_id::y(): Selects the y-th tile along the n'
// dimension
//
// Result: A 2D tensor of shape (BlockShape_M, BlockShape_N) corresponding to
// the tile assigned to the current work group.
Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (BlockShape_M, BlockShape_N)
Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(),
compat::work_group_id::y()); // (BlockShape_M, BlockShape_N)

// Construct a partitioning of the tile among threads with the given thread
// arrangement.

// Concept: Tensor ThrLayout ThrIndex
Tensor thr_tile_S = local_partition(
tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)
Tensor thr_tile_D = local_partition(
tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN)

// Construct a register-backed Tensor with the same shape as each thread's
// partition. Use make_tensor_like to match the layout of thr_tile_S.
Tensor fragment = make_tensor_like(thr_tile_S); // (ThrValM, ThrValN)

// Copy from GMEM to RMEM and from RMEM to GMEM
copy(thr_tile_S, fragment);
copy(fragment, thr_tile_D);
}

template <typename Element>
void copy_direct(TransposeParams<Element> params) {
//
// Given a 2D shape, perform an efficient copy
//

using namespace cute;

//
// Make tensors
//
auto tensor_shape = make_shape(params.M, params.N);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

//
// Tile tensors
//

// Define a statically sized block (M, N).
// Note, by convention, capital letters are used to represent static modes.
auto block_shape = make_shape(Int<1>{}, Int<16384>{});

if ((size<0>(tensor_shape) % size<0>(block_shape)) ||
(size<1>(tensor_shape) % size<1>(block_shape))) {
std::cerr << "The tensor shape must be divisible by the block shape."
<< std::endl;
return;
}
// Equivalent check to the above
if (not evenly_divides(tensor_shape, block_shape)) {
std::cerr << "Expected the block_shape to evenly divide the tensor shape."
<< std::endl;
return;
}

// Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static
// tile shape, and modes (m', n') correspond to the number of tiles.
//
// These will be used to determine the CUDA kernel grid dimensions.
Tensor tiled_tensor_S =
tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
Tensor tiled_tensor_D =
tiled_divide(tensor_D, block_shape); // ((M, N), m', n')
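// Illustration (example sizes, not from this diff): for a 2048 x 16384 input
// and the (1, 16384) block above, tiled_divide produces shape
// ((1, 16384), 2048, 1): 2048 tiles along m' and a single tile along n'.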

// Thread arrangement
Layout thr_layout =
make_layout(make_shape(Int<1>{}, Int<1024>{}), LayoutRight{});

//
// Determine grid and block dimensions
//

auto gridDim = compat::dim3(
size<1>(tiled_tensor_S),
size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
auto blockDim = compat::dim3(size(thr_layout));

//
// Launch the kernel
//
compat::launch<copy_kernel<decltype(tiled_tensor_S),
decltype(tiled_tensor_D), decltype(thr_layout)>>(
gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout);
}
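transpose/main.cpp is referenced by the CMake change but its body is not part of this diff. A minimal host driver sketch for the direct copy path, under the assumptions that TransposeParams<Element> is an aggregate over {M, N, input, output} and that the compat layer used by compat::launch targets the same default device/context as a default-constructed queue (both assumptions, not confirmed by this diff):

#include <sycl/sycl.hpp>
#include <vector>
// Assumed includes from this PR: "copy_direct.h" and "util.h" (TransposeParams),
// plus examples/common/sycl_cute_common.hpp for random_fill.

int main() {
  using Element = float;
  int M = 2048, N = 16384;                                   // illustrative sizes only

  sycl::queue q;                                             // default device
  Element *d_in  = sycl::malloc_shared<Element>(static_cast<std::size_t>(M) * N, q);
  Element *d_out = sycl::malloc_shared<Element>(static_cast<std::size_t>(M) * N, q);

  std::vector<Element> h_in(static_cast<std::size_t>(M) * N);
  random_fill(h_in);                                         // new helper from this PR
  q.copy(h_in.data(), d_in, h_in.size()).wait();

  TransposeParams<Element> params{M, N, d_in, d_out};        // assumed member order
  copy_direct(params);                                       // launch the direct copy
  q.wait();                                                  // illustrative; real code should
                                                             // synchronize via the compat layer

  sycl::free(d_in, q);
  sycl::free(d_out, q);
  return 0;
}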