diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 1a917932a9a84..291b809071ce9 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -20,6 +20,10 @@ class RewritePatternSet;
 template <typename OpT>
 class OperationPass;
 
+namespace amdgpu {
+struct Chipset;
+} // namespace amdgpu
+
 namespace gpu {
 class GPUModuleOp;
 } // namespace gpu
@@ -32,7 +36,8 @@ class GPUModuleOp;
 /// The resulting pattern set should be run over a gpu.module op
 void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter,
                                           RewritePatternSet &patterns,
-                                          gpu::amd::Runtime runtime);
+                                          gpu::amd::Runtime runtime,
+                                          amdgpu::Chipset chipset);
 
 /// Configure target to convert from the GPU dialect to ROCDL.
 void configureGpuToROCDLConversionLegality(ConversionTarget &target);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 186a4f53f93cb..93e59e0e7e6be 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -216,6 +216,8 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">;
 def ROCDL_BlockIdYOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.y">;
 def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">;
 
+def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;
+
 //===----------------------------------------------------------------------===//
 // Thread range and Block range
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e6dd6f135884e..6b180860ff4eb 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -52,6 +52,25 @@ namespace mlir {
 
 using namespace mlir;
 
+// Truncate or extend the result depending on the index bitwidth specified
+// by the LLVMTypeConverter options.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+                                  Location loc, Value value,
+                                  const LLVMTypeConverter &converter) {
+  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
+  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
+  auto indexBitwidthType = IntegerType::get(rewriter.getContext(),
+                                            converter.getIndexTypeBitwidth());
+  // TODO: use <=> in C++20.
+  if (indexBitwidth > intWidth) {
+    return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
+  }
+  if (indexBitwidth < intWidth) {
+    return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
+  }
+  return value;
+}
+
 /// Returns true if the given `gpu.func` can be safely called using the bare
 /// pointer calling convention.
 static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
@@ -113,6 +132,35 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   }
 };
 
+struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
+                           amdgpu::Chipset chipset)
+      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
+        chipset(chipset) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    LLVM::ConstantRangeAttr bounds = nullptr;
+    bool isBeforeGfx10 = chipset.majorVersion < 10;
+    if (auto upperBoundAttr = op.getUpperBoundAttr()) {
+      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
+          /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
+          /*upper=*/op.getUpperBoundAttr().getInt() + 1);
+    }
+    Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
+        op.getLoc(), rewriter.getI32Type(), bounds);
+    wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
+                                       *getTypeConverter());
+    rewriter.replaceOp(op, {wavefrontOp});
+    return success();
+  }
+
+  const amdgpu::Chipset chipset;
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
 
@@ -319,7 +367,8 @@ struct LowerGpuOpsToROCDLOpsPass final
 
     populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                             *maybeChipset);
-    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
+    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
+                                         *maybeChipset);
     configureGpuToROCDLConversionLegality(target);
     if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
       signalPassFailure();
@@ -367,7 +416,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
 
 void mlir::populateGpuToROCDLConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
-    mlir::gpu::amd::Runtime runtime) {
+    mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
   using gpu::index_lowering::IndexKind;
   using gpu::index_lowering::IntrType;
   using mlir::gpu::amd::Runtime;
@@ -405,7 +454,10 @@ void mlir::populateGpuToROCDLConversionPatterns(
   // TODO: Add alignment for workgroup memory
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
-  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+  patterns
+      .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(
+          converter);
+  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
 
   populateMathToROCDLConversionPatterns(converter, patterns);
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 071cae9d5789f..4cb35a458fcfa 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -11,7 +11,7 @@ gpu.module @test_module {
   func.func @gpu_index_ops()
       -> (index, index, index, index, index, index,
           index, index, index, index, index, index,
-          index) {
+          index, index, index) {
     // CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64
 
     // CHECK: rocdl.workitem.id.x : i32
@@ -59,12 +59,20 @@ gpu.module @test_module {
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %laneId = gpu.lane_id
 
+    // CHECK: = rocdl.wavefrontsize : i32
+    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    %subgroupSize = gpu.subgroup_size : index
+
+    // CHECK: = rocdl.wavefrontsize range <i32, 64, 65> : i32
+    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    %subgroupSize2 = gpu.subgroup_size upper_bound 64 : index
+
     func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
                 %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
-                %laneId
+                %laneId, %subgroupSize, %subgroupSize2
         : index, index, index, index, index, index,
           index, index, index, index, index, index,
-          index
+          index, index, index
   }
 }
 
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3db1f7b2b6427..af47582dd0bfb 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -32,6 +32,13 @@ llvm.func @rocdl_special_regs() -> i32 {
 
   // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0)
   %14 = rocdl.workgroup.dim.x range <i64, 1, 65> : i64
+
+  // CHECK: call i32 @llvm.amdgcn.wavefrontsize()
+  %15 = rocdl.wavefrontsize : i32
+
+  // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wavefrontsize()
+  %16 = rocdl.wavefrontsize range <i32, 32, 65> : i32
+
   llvm.return %1 : i32
 }
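
Note (not part of the patch itself): populateGpuToROCDLConversionPatterns now requires a parsed amdgpu::Chipset, so out-of-tree callers need a matching update. The chipset matters because the wavefront-size bounds depend on the target generation: wavefronts are 64 lanes wide before gfx10, while gfx10+ can run wave32 or wave64. A minimal sketch of an updated call site, assuming a pass that, like LowerGpuOpsToROCDLOpsPass above, carries a `chipset` string option; the parse-and-fail handling mirrors that pass:

    // Parse the chipset string (e.g. "gfx90a") into its structured form.
    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }
    // The runtime argument (HIP vs. OpenCL) is unchanged; only the trailing
    // chipset argument is new.
    populateGpuToROCDLConversionPatterns(converter, patterns, runtime,
                                         *maybeChipset);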