diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index a13ad33df29cd..6cd6f03253aea 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -39,6 +39,10 @@ class FuncOp; /// Collect a set of patterns to rewrite GlobalIdOp op within the GPU dialect. void populateGpuGlobalIdPatterns(RewritePatternSet &patterns); +/// Collect a set of patterns to rewrite SubgroupIdOp op within the GPU +/// dialect. +void populateGpuSubgroupIdPatterns(RewritePatternSet &patterns); + /// Collect a set of patterns to rewrite shuffle ops within the GPU dialect. void populateGpuShufflePatterns(RewritePatternSet &patterns); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index be6492a22f34f..e21fa501bae6b 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -40,6 +40,7 @@ add_mlir_dialect_library(MLIRGPUTransforms Transforms/ROCDLAttachTarget.cpp Transforms/ShuffleRewriter.cpp Transforms/SPIRVAttachTarget.cpp + Transforms/SubgroupIdRewriter.cpp Transforms/SubgroupReduceLowering.cpp OBJECT diff --git a/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp index 0c730df73b519..c40ddd9b15afc 100644 --- a/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp @@ -26,7 +26,7 @@ struct GpuGlobalIdRewriter : public OpRewritePattern { LogicalResult matchAndRewrite(gpu::GlobalIdOp op, PatternRewriter &rewriter) const override { - auto loc = op.getLoc(); + Location loc = op.getLoc(); auto dim = op.getDimension(); auto blockId = rewriter.create(loc, dim); auto blockDim = rewriter.create(loc, dim); diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp new file mode 100644 index 0000000000000..0f0df08919553 --- 
/dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp @@ -0,0 +1,82 @@ +//===- SubgroupIdRewriter.cpp - Implementation of SubgroupId rewriting ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements in-dialect rewriting of the gpu.subgroup_id op for archs +// where: +// subgroup_id = (tid.x + dim.x * (tid.y + dim.y * tid.z)) / subgroup_size +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +struct GpuSubgroupIdRewriter final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(gpu::SubgroupIdOp op, + PatternRewriter &rewriter) const override { + // Calculation of the thread's subgroup identifier. + // + // The process involves mapping the thread's 3D identifier within its + // block (tid.x, tid.y, tid.z) to a 1D linear index. + // This linearization assumes a layout where the x-dimension (tid.x) + // varies most rapidly (i.e., it is the innermost dimension). + // + // The formula for the linearized thread index is: + // L = tid.x + dim.x * (tid.y + (dim.y * tid.z)) + // + // Subsequently, the range of linearized indices [0, N_threads-1] is + // divided into consecutive, non-overlapping segments, each representing + // a subgroup of size 'subgroup_size'. + // + // Example Partitioning (N = subgroup_size): + // | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... | + // | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... 
| + // + // The subgroup identifier is obtained via integer division of the + // linearized thread index by the predefined 'subgroup_size'. + // + // subgroup_id = floor( L / subgroup_size ) + // = (tid.x + dim.x * (tid.y + dim.y * tid.z)) / + // subgroup_size + + Location loc = op->getLoc(); + + Value dimX = rewriter.create(loc, gpu::Dimension::x); + Value dimY = rewriter.create(loc, gpu::Dimension::y); + Value tidX = rewriter.create(loc, gpu::Dimension::x); + Value tidY = rewriter.create(loc, gpu::Dimension::y); + Value tidZ = rewriter.create(loc, gpu::Dimension::z); + + Value dimYxIdZ = rewriter.create(loc, dimY, tidZ); + Value dimYxIdZPlusIdY = rewriter.create(loc, dimYxIdZ, tidY); + Value dimYxIdZPlusIdYTimesDimX = + rewriter.create(loc, dimX, dimYxIdZPlusIdY); + Value IdXPlusDimYxIdZPlusIdYTimesDimX = + rewriter.create(loc, tidX, dimYxIdZPlusIdYTimesDimX); + Value subgroupSize = rewriter.create( + loc, rewriter.getIndexType(), /*upper_bound = */ nullptr); + Value subgroupIdOp = rewriter.create( + loc, IdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize); + rewriter.replaceOp(op, {subgroupIdOp}); + return success(); + } +}; + +} // namespace + +void mlir::populateGpuSubgroupIdPatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} diff --git a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir new file mode 100644 index 0000000000000..a0c852f6fbe88 --- /dev/null +++ b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt --test-gpu-rewrite -split-input-file %s | FileCheck %s + +// CHECK-LABEL: func.func @subgroupId +// CHECK-SAME: (%[[SZ:.*]]: index, %[[MEM:.*]]: memref) { +func.func @subgroupId(%sz : index, %mem: memref) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz) + threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) { + // CHECK: %[[DIMX:.*]] = gpu.block_dim x + // CHECK-NEXT: %[[DIMY:.*]] = gpu.block_dim 
y + // CHECK-NEXT: %[[TIDX:.*]] = gpu.thread_id x + // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y + // CHECK-NEXT: %[[TIDZ:.*]] = gpu.thread_id z + // CHECK-NEXT: %[[T0:.*]] = index.mul %[[DIMY]], %[[TIDZ]] + // CHECK-NEXT: %[[T1:.*]] = index.add %[[T0]], %[[TIDY]] + // CHECK-NEXT: %[[T2:.*]] = index.mul %[[DIMX]], %[[T1]] + // CHECK-NEXT: %[[T3:.*]] = index.add %[[TIDX]], %[[T2]] + // CHECK-NEXT: %[[T4:.*]] = gpu.subgroup_size : index + // CHECK-NEXT: %[[T5:.*]] = index.divu %[[T3]], %[[T4]] + %idz = gpu.subgroup_id : index + memref.store %idz, %mem[] : memref + gpu.terminator + } + return +} diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index fe402da4cc105..616f458e4824c 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -41,6 +41,7 @@ struct TestGpuRewritePass void runOnOperation() override { RewritePatternSet patterns(&getContext()); populateGpuRewritePatterns(patterns); + populateGpuSubgroupIdPatterns(patterns); (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } };