From fcd98013cfe99eea6a0e1b9fad08ba4b40ffdb2b Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Fri, 17 Oct 2025 20:39:47 +0000 Subject: [PATCH 1/3] Generalize gpu.printf op to llvm call lowering pattern for use cases other than AMD gpu OpenCL runtime. --- .../Conversion/GPUCommon/GPUOpsLowering.cpp | 6 ++-- .../lib/Conversion/GPUCommon/GPUOpsLowering.h | 15 +++++++--- .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 5 +++- mlir/test/Conversion/GPUToLLVMSPV/printf.mlir | 16 ++++++++++ .../Dialect/XeVM/GPU/gpu_printf.mlir | 29 +++++++++++++++++++ 5 files changed, 64 insertions(+), 7 deletions(-) create mode 100644 mlir/test/Conversion/GPUToLLVMSPV/printf.mlir create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index 2285d2695db4e..eb662a1b056de 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -507,7 +507,8 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite( LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType}, /*isVarArg=*/true); LLVM::LLVMFuncOp printfDecl = - getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType); + getOrDefineFunction(moduleOp, loc, rewriter, funcName, printfType); + printfDecl.setCConv(callingConvention); // Create the global op or find an existing one. 
LLVM::GlobalOp global = getOrCreateStringConstant( @@ -530,7 +531,8 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite( printfArgs.push_back(stringStart); printfArgs.append(argsRange.begin(), argsRange.end()); - LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs); + auto call = LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs); + call.setCConv(callingConvention); rewriter.eraseOp(gpuPrintfOp); return success(); } diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h index 66d3bb40a8f5a..adf5ba2feb591 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -10,6 +10,7 @@ #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" namespace mlir { @@ -142,13 +143,17 @@ struct GPUPrintfOpToHIPLowering : public ConvertOpToLLVMPattern { /// This pass will add a declaration of printf() to the GPUModule if needed /// and separate out the format strings into global constants. For some /// runtimes, such as OpenCL on AMD, this is sufficient setup, as the compiler -/// will lower printf calls to appropriate device-side code +/// will lower printf calls to appropriate device-side code. +/// callingConvention and funcName can be adjusted as needed. 
struct GPUPrintfOpToLLVMCallLowering : public ConvertOpToLLVMPattern { - GPUPrintfOpToLLVMCallLowering(const LLVMTypeConverter &converter, - int addressSpace = 0) + GPUPrintfOpToLLVMCallLowering( + const LLVMTypeConverter &converter, int addressSpace = 0, + LLVM::cconv::CConv callingConvention = LLVM::cconv::CConv::C, + StringRef funcName = "printf") : ConvertOpToLLVMPattern(converter), - addressSpace(addressSpace) {} + addressSpace(addressSpace), callingConvention(callingConvention), + funcName(funcName) {} LogicalResult matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor, @@ -156,6 +161,8 @@ struct GPUPrintfOpToLLVMCallLowering private: int addressSpace; + LLVM::cconv::CConv callingConvention; + StringRef funcName; }; /// Lowering of gpu.printf to a vprintf standard library. diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index c2363a1a40294..29437f1ae5c0c 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -470,10 +470,13 @@ struct GPUToLLVMSPVConversionPass final gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp, gpu::LaneIdOp, gpu::NumSubgroupsOp, gpu::ReturnOp, gpu::ShuffleOp, gpu::SubgroupIdOp, gpu::SubgroupSizeOp, - gpu::ThreadIdOp>(); + gpu::ThreadIdOp, gpu::PrintfOp>(); populateGpuToLLVMSPVConversionPatterns(converter, patterns); populateGpuMemorySpaceAttributeConversions(converter); + patterns.add(converter, /*addressSpace=*/2, + LLVM::cconv::CConv::SPIR_FUNC, + "Z6printfPU3AS2Kcz"); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) diff --git a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir new file mode 100644 index 0000000000000..1b17da9f4eeee --- /dev/null +++ b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir @@ -0,0 +1,16 @@ +// RUN: mlir-opt %s -convert-gpu-to-llvm-spv | FileCheck %s + +gpu.module @test_module { + // 
CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 2 : i32} + // CHECK: llvm.func spir_funccc @Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) -> i32 + // CHECK-LABEL: llvm.func spir_funccc @test_printf + // CHECK: (%[[ARG0:.*]]: i32) + gpu.func @test_printf(%arg0: i32) { + // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<2> + // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<2>) -> !llvm.ptr<2>, !llvm.array<11 x i8> + // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func, ...)>) : (!llvm.ptr<2>, i32) -> i32 + gpu.printf "Hello: %d\n", %arg0 : i32 + gpu.return + } +} + diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir new file mode 100644 index 0000000000000..f9c305b04207b --- /dev/null +++ b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir @@ -0,0 +1,29 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \ +// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @test attributes {gpu.container_module} { + gpu.module @test_module { + gpu.func @test_printf(%arg0: i32, %arg1: f32) kernel { + gpu.printf "Hello: %d\n", %arg0 : i32 + gpu.printf "Hello: %f\n", %arg1 : f32 + gpu.return + } + } + + func.func @main() attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c11 = arith.constant 11 : i32 + 
%c4 = arith.constant 4.0 : f32 + // CHECK: "Hello: 11" + gpu.launch_func @test_module::@test_printf blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%c11 : i32, %c4 : f32) + return + } +} From 433d89eb792f36ee17fa4720601c4f2c48e21105 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Mon, 20 Oct 2025 18:25:26 +0000 Subject: [PATCH 2/3] Fix incorrect function name. --- mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 2 +- mlir/test/Conversion/GPUToLLVMSPV/printf.mlir | 4 ++-- mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 29437f1ae5c0c..25f1e1b184d61 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -476,7 +476,7 @@ struct GPUToLLVMSPVConversionPass final populateGpuMemorySpaceAttributeConversions(converter); patterns.add(converter, /*addressSpace=*/2, LLVM::cconv::CConv::SPIR_FUNC, - "Z6printfPU3AS2Kcz"); + "_Z6printfPU3AS2Kcz"); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) diff --git a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir index 1b17da9f4eeee..74017e8354cf1 100644 --- a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir +++ b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir @@ -2,13 +2,13 @@ gpu.module @test_module { // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 2 : i32} - // CHECK: llvm.func spir_funccc @Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) -> i32 + // CHECK: llvm.func spir_funccc @_Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) 
-> i32 // CHECK-LABEL: llvm.func spir_funccc @test_printf // CHECK: (%[[ARG0:.*]]: i32) gpu.func @test_printf(%arg0: i32) { // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<2> // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<2>) -> !llvm.ptr<2>, !llvm.array<11 x i8> - // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func, ...)>) : (!llvm.ptr<2>, i32) -> i32 + // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @_Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func, ...)>) : (!llvm.ptr<2>, i32) -> i32 gpu.printf "Hello: %d\n", %arg0 : i32 gpu.return } diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir index f9c305b04207b..edf8775c72418 100644 --- a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir +++ b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir @@ -22,7 +22,8 @@ module @test attributes {gpu.container_module} { %c1 = arith.constant 1 : index %c11 = arith.constant 11 : i32 %c4 = arith.constant 4.0 : f32 - // CHECK: "Hello: 11" + // CHECK: Hello: 11 + // CHECK: Hello: 4.000000 gpu.launch_func @test_module::@test_printf blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%c11 : i32, %c4 : f32) return } From f7b4310e58a93baeb5b76eabeb1404544e591dcd Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 22 Oct 2025 20:59:24 +0000 Subject: [PATCH 3/3] Add explanation on why calling convention and function name need to support customization. 
--- mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h index adf5ba2feb591..ec74787b2a8ed 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -144,7 +144,13 @@ struct GPUPrintfOpToHIPLowering : public ConvertOpToLLVMPattern { /// and separate out the format strings into global constants. For some /// runtimes, such as OpenCL on AMD, this is sufficient setup, as the compiler /// will lower printf calls to appropriate device-side code. -/// callingConvention and funcName can be adjusted as needed. +/// However not all backends use the same calling convention and function +/// naming. +/// For example, the LLVM SPIRV backend requires calling convention +/// LLVM::cconv::CConv::SPIR_FUNC and function name needs to be +/// mangled as "_Z6printfPU3AS2Kcz". +/// Default callingConvention is LLVM::cconv::CConv::C and +/// funcName is "printf" but they can be customized as needed. struct GPUPrintfOpToLLVMCallLowering : public ConvertOpToLLVMPattern { GPUPrintfOpToLLVMCallLowering(