From 7c0e3a50fde54ab3179b99c1e3342f4cc3b3ae39 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Sat, 19 Apr 2025 00:51:15 -0400 Subject: [PATCH 1/2] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL --- .../Conversion/GPUToROCDL/GPUToROCDLPass.h | 7 +- mlir/include/mlir/Conversion/Passes.td | 4 + .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 143 +++++++++++------- .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 22 +++ 4 files changed, 118 insertions(+), 58 deletions(-) diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h index 291b809071ce9..1b265ecfc48e7 100644 --- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -10,6 +10,7 @@ #include "mlir/Conversion/GPUToROCDL/Runtimes.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include namespace mlir { @@ -46,11 +47,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target); /// index bitwidth used for the lowering of the device side index computations /// is configurable. 
std::unique_ptr> -createLowerGpuOpsToROCDLOpsPass( - const std::string &chipset = "gfx900", - unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout, - bool useBarePtrCallConv = false, - gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown); +createLowerGpuOpsToROCDLOpsPass(); } // namespace mlir diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index bbba495e613b2..a558aeffba5e4 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -608,6 +608,10 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> { clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"), clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL", "OpenCL"))}]>, + Option<"subgroupSize", "subgroup-size", "unsigned", + "0", + "specify subgroup size for the kernel, if left empty, the default " + "value will be decided by the target chipset.">, ListOption<"allowedDialects", "allowed-dialects", "std::string", "Run conversion patterns of only the specified dialects">, ]; diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index dd16ec4b73e9f..932063aa109ba 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -52,25 +52,6 @@ namespace mlir { using namespace mlir; -// Truncate or extend the result depending on the index bitwidth specified -// by the LLVMTypeConverter options. -static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, - Location loc, Value value, - const LLVMTypeConverter &converter) { - int64_t intWidth = cast(value.getType()).getWidth(); - int64_t indexBitwidth = converter.getIndexTypeBitwidth(); - auto indexBitwidthType = - IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth()); - // TODO: use <=> in C++20. 
- if (indexBitwidth > intWidth) { - return rewriter.create(loc, indexBitwidthType, value); - } - if (indexBitwidth < intWidth) { - return rewriter.create(loc, indexBitwidthType, value); - } - return value; -} - /// Returns true if the given `gpu.func` can be safely called using the bare /// pointer calling convention. static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) { @@ -99,6 +80,26 @@ static constexpr StringLiteral amdgcnDataLayout = "64-S32-A5-G1-ni:7:8:9"; namespace { + +// Truncate or extend the result depending on the index bitwidth specified +// by the LLVMTypeConverter options. +static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, + Location loc, Value value, + const LLVMTypeConverter &converter) { + int64_t intWidth = cast(value.getType()).getWidth(); + int64_t indexBitwidth = converter.getIndexTypeBitwidth(); + auto indexBitwidthType = + IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth()); + // TODO: use <=> in C++20. + if (indexBitwidth > intWidth) { + return rewriter.create(loc, indexBitwidthType, value); + } + if (indexBitwidth < intWidth) { + return rewriter.create(loc, indexBitwidthType, value); + } + return value; +} + struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -117,16 +118,7 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { rewriter.create(loc, intTy, ValueRange{minus1, zero}); Value laneId = rewriter.create( loc, intTy, ValueRange{minus1, mbcntLo}); - // Truncate or extend the result depending on the index bitwidth specified - // by the LLVMTypeConverter options. 
- const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); - if (indexBitwidth > 32) { - laneId = rewriter.create( - loc, IntegerType::get(context, indexBitwidth), laneId); - } else if (indexBitwidth < 32) { - laneId = rewriter.create( - loc, IntegerType::get(context, indexBitwidth), laneId); - } + laneId = truncOrExtToLLVMType(rewriter, loc, laneId, *getTypeConverter()); rewriter.replaceOp(op, {laneId}); return success(); } @@ -150,11 +142,11 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern { /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32, /*upper=*/op.getUpperBoundAttr().getInt() + 1); } - Value wavefrontOp = rewriter.create( + Value wavefrontSizeOp = rewriter.create( op.getLoc(), rewriter.getI32Type(), bounds); - wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp, - *getTypeConverter()); - rewriter.replaceOp(op, {wavefrontOp}); + wavefrontSizeOp = truncOrExtToLLVMType( + rewriter, op.getLoc(), wavefrontSizeOp, *getTypeConverter()); + rewriter.replaceOp(op, {wavefrontSizeOp}); return success(); } @@ -239,6 +231,65 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { } }; +struct GPUSubgroupIdOpToROCDL final + : ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Calculation of the thread's subgroup identifier. + // + // The process involves mapping the thread's 3D identifier within its + // workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index. + // This linearization assumes a layout where the x-dimension (w_dim.x) + // varies most rapidly (i.e., it is the innermost dimension). 
+ // + // The formula for the linearized thread index is: + // L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z)) + // + // Subsequently, the range of linearized indices [0, N_threads-1] is + // divided into consecutive, non-overlapping segments, each representing + // a subgroup of size 'subgroup_size'. + // + // Example Partitioning (N = subgroup_size): + // | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... | + // | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... | + // + // The subgroup identifier is obtained via integer division of the + // linearized thread index by the predefined 'subgroup_size'. + // + // subgroup_id = floor( L / subgroup_size ) + // = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) / + // subgroup_size + auto int32Type = IntegerType::get(rewriter.getContext(), 32); + Location loc = op.getLoc(); + LLVM::IntegerOverflowFlags flags = + LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw; + Value workitemIdX = rewriter.create(loc, int32Type); + Value workitemIdY = rewriter.create(loc, int32Type); + Value workitemIdZ = rewriter.create(loc, int32Type); + Value workitemDimX = rewriter.create(loc, int32Type); + Value workitemDimY = rewriter.create(loc, int32Type); + Value dimYxIdZ = rewriter.create(loc, int32Type, workitemDimY, + workitemIdZ, flags); + Value dimYxIdZPlusIdY = rewriter.create( + loc, int32Type, dimYxIdZ, workitemIdY, flags); + Value dimYxIdZPlusIdYTimesDimX = rewriter.create( + loc, int32Type, workitemDimX, dimYxIdZPlusIdY, flags); + Value workitemIdXPlusDimYxIdZPlusIdYTimesDimX = + rewriter.create(loc, int32Type, workitemIdX, + dimYxIdZPlusIdYTimesDimX, flags); + Value subgroupSize = rewriter.create( + loc, rewriter.getI32Type(), nullptr); + Value waveIdOp = rewriter.create( + loc, workitemIdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize); + rewriter.replaceOp(op, {truncOrExtToLLVMType(rewriter, loc, waveIdOp, + *getTypeConverter())}); + return success(); + } +}; + /// Import the GPU Ops to ROCDL Patterns. 
#include "GPUToROCDL.cpp.inc" @@ -249,19 +300,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { // code. struct LowerGpuOpsToROCDLOpsPass final : public impl::ConvertGpuOpsToROCDLOpsBase { - LowerGpuOpsToROCDLOpsPass() = default; - LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth, - bool useBarePtrCallConv, - gpu::amd::Runtime runtime) { - if (this->chipset.getNumOccurrences() == 0) - this->chipset = chipset; - if (this->indexBitwidth.getNumOccurrences() == 0) - this->indexBitwidth = indexBitwidth; - if (this->useBarePtrCallConv.getNumOccurrences() == 0) - this->useBarePtrCallConv = useBarePtrCallConv; - if (this->runtime.getNumOccurrences() == 0) - this->runtime = runtime; - } + using Base::Base; void getDependentDialects(DialectRegistry &registry) const override { Base::getDependentDialects(registry); @@ -455,17 +494,15 @@ void mlir::populateGpuToROCDLConversionPatterns( // TODO: Add alignment for workgroup memory patterns.add(converter); - patterns.add(converter); + patterns + .add( + converter); patterns.add(converter, chipset); populateMathToROCDLConversionPatterns(converter, patterns); } std::unique_ptr> -mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset, - unsigned indexBitwidth, - bool useBarePtrCallConv, - gpu::amd::Runtime runtime) { - return std::make_unique( - chipset, indexBitwidth, useBarePtrCallConv, runtime); +mlir::createLowerGpuOpsToROCDLOpsPass() { + return std::make_unique(); } diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index d28aa9e34c22a..e8868aeda4dcb 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -763,3 +763,25 @@ gpu.module @test_module { gpu.module @test_custom_data_layout attributes {llvm.data_layout = "e"} { } + +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_subgroup_id() + func.func @gpu_subgroup_id() -> (index) { + 
// CHECK: %[[widx:.*]] = rocdl.workitem.id.x : i32 + // CHECK: %[[widy:.*]] = rocdl.workitem.id.y : i32 + // CHECK: %[[widz:.*]] = rocdl.workitem.id.z : i32 + // CHECK: %[[dimx:.*]] = rocdl.workgroup.dim.x : i32 + // CHECK: %[[dimy:.*]] = rocdl.workgroup.dim.y : i32 + // CHECK: %[[int5:.*]] = llvm.mul %[[dimy]], %[[widz]] overflow<nsw, nuw> : i32 + // CHECK: %[[int6:.*]] = llvm.add %[[int5]], %[[widy]] overflow<nsw, nuw> : i32 + // CHECK: %[[int7:.*]] = llvm.mul %[[dimx]], %[[int6]] overflow<nsw, nuw> : i32 + // CHECK: %[[int8:.*]] = llvm.add %[[widx]], %[[int7]] overflow<nsw, nuw> : i32 + // CHECK: %[[wavefrontsize:.*]] = rocdl.wavefrontsize : i32 + // CHECK: %[[result:.*]] = llvm.udiv %[[int8]], %[[wavefrontsize]] : i32 + // CHECK: = llvm.sext %[[result]] : i32 to i64 + %subgroupId = gpu.subgroup_id : index + func.return %subgroupId : index + } +} From 293f6245a5bfe467fcfec91f1f98142ac4c5a09a Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 28 Apr 2025 09:40:50 -0400 Subject: [PATCH 2/2] Another update --- mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h | 1 - mlir/include/mlir/Conversion/Passes.td | 4 ---- mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 3 +-- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h index 1b265ecfc48e7..b6d051cd0af96 100644 --- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -10,7 +10,6 @@ #include "mlir/Conversion/GPUToROCDL/Runtimes.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" -#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include namespace mlir { diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index a558aeffba5e4..bbba495e613b2 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -608,10 +608,6 @@ def ConvertGpuOpsToROCDLOps : 
Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> { clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"), clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL", "OpenCL"))}]>, - Option<"subgroupSize", "subgroup-size", "unsigned", - "0", - "specify subgroup size for the kernel, if left empty, the default " - "value will be decided by the target chipset.">, ListOption<"allowedDialects", "allowed-dialects", "std::string", "Run conversion patterns of only the specified dialects">, ]; diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 932063aa109ba..7e96fd24c3e2f 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -90,7 +90,6 @@ static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, int64_t indexBitwidth = converter.getIndexTypeBitwidth(); auto indexBitwidthType = IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth()); - // TODO: use <=> in C++20. if (indexBitwidth > intWidth) { return rewriter.create(loc, indexBitwidthType, value); } @@ -281,7 +280,7 @@ struct GPUSubgroupIdOpToROCDL final rewriter.create(loc, int32Type, workitemIdX, dimYxIdZPlusIdYTimesDimX, flags); Value subgroupSize = rewriter.create( - loc, rewriter.getI32Type(), nullptr); + loc, rewriter.getI32Type(), /*upper_bound = */ nullptr); Value waveIdOp = rewriter.create( loc, workitemIdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize); rewriter.replaceOp(op, {truncOrExtToLLVMType(rewriter, loc, waveIdOp,